diff --git a/thirdparty/aom/xmake.lua b/thirdparty/aom/xmake.lua new file mode 100644 index 0000000..74233d5 --- /dev/null +++ b/thirdparty/aom/xmake.lua @@ -0,0 +1,23 @@ +package("aom") + + set_homepage("https://aomedia.googlesource.com/aom/") + set_description("AV1 Codec Library") + set_license("BSD-3-Clause") + set_urls("https://aomedia.googlesource.com/aom.git") + add_versions("v3.9.0", "6cab58c3925e0f4138e15a4ed510161ea83b6db1") + + add_deps("cmake") + + if is_os("windows") then + add_defines("_CRT_SECURE_NO_WARNINGS") + end + + on_install("windows", "linux", "macosx", function (package) + local configs = {"-DENABLE_EXAMPLES=OFF", "-DENABLE_TESTS=OFF", "-DENABLE_TOOLS=OFF", "-DENABLE_DOCS=OFF"} + table.insert(configs, "-DCMAKE_BUILD_TYPE=" .. (package:debug() and "Debug" or "Release")) + import("package.tools.cmake").install(package, configs) + end) + + on_test(function (package) + assert(package:has_cfuncs("aom_codec_version", {includes = "aom/aom_codec.h"})) + end) \ No newline at end of file diff --git a/thirdparty/libyuv/.clang-format b/thirdparty/libyuv/.clang-format deleted file mode 100644 index 59d4870..0000000 --- a/thirdparty/libyuv/.clang-format +++ /dev/null @@ -1,6 +0,0 @@ -# Defines the Chromium style for automatic reformatting. -# http://clang.llvm.org/docs/ClangFormatStyleOptions.html -BasedOnStyle: Chromium ---- -Language: Java -BasedOnStyle: Google diff --git a/thirdparty/libyuv/.gitignore b/thirdparty/libyuv/.gitignore deleted file mode 100644 index 7095d41..0000000 --- a/thirdparty/libyuv/.gitignore +++ /dev/null @@ -1,36 +0,0 @@ -*.pyc -.landmines -pin-log.txt -/base -/build -/buildtools -/google_apis -/links -/links.db -/ios -/mojo -/native_client -/net -/out -/source/out -/sde-avx-sse-transition-out.txt -/testing -/third_party -/tools - -# Files generated by CMake build -cmake_install.cmake -CMakeCache.txt -CMakeFiles/ -yuvconvert -libgtest.a -libyuv.a -libyuv_unittest - -# Files generated by winarm.mk build -libyuv_arm.lib -source/*.o - -# Files generated by perf -perf.data -perf.data.old diff --git a/thirdparty/libyuv/.gn b/thirdparty/libyuv/.gn deleted file mode 100644 index be8c3b5..0000000 --- a/thirdparty/libyuv/.gn +++ /dev/null @@ -1,36 +0,0 @@ -# Copyright 2015 The LibYuv Project Authors. All rights reserved. -# -# Use of this source code is governed by a BSD-style license -# that can be found in the LICENSE file in the root of the source -# tree. An additional intellectual property rights grant can be found -# in the file PATENTS. All contributing project authors may -# be found in the AUTHORS file in the root of the source tree. - -import("//build/dotfile_settings.gni") - -# The location of the build configuration file. -buildconfig = "//build/config/BUILDCONFIG.gn" - -# The secondary source root is a parallel directory tree where -# GN build files are placed when they can not be placed directly -# in the source tree, e.g. for third party source trees. -secondary_source = "//build/secondary/" - -# These are the targets to check headers for by default. The files in targets -# matching these patterns (see "gn help label_pattern" for format) will have -# their includes checked for proper dependencies when you run either -# "gn check" or "gn gen --check". -check_targets = [ "//libyuv/*" ] - -# These are the list of GN files that run exec_script. This whitelist exists -# to force additional review for new uses of exec_script, which is strongly -# discouraged except for gypi_to_gn calls. 
-exec_script_whitelist = build_dotfile_settings.exec_script_whitelist + - [ "//build_overrides/build.gni" ] - -default_args = { - mac_sdk_min = "10.12" - - # https://bugs.chromium.org/p/libyuv/issues/detail?id=826 - ios_deployment_target = "10.0" -} diff --git a/thirdparty/libyuv/.vpython b/thirdparty/libyuv/.vpython deleted file mode 100644 index e0aaf89..0000000 --- a/thirdparty/libyuv/.vpython +++ /dev/null @@ -1,59 +0,0 @@ -# This is a vpython "spec" file. -# -# It describes patterns for python wheel dependencies of the python scripts in -# the chromium repo, particularly for dependencies that have compiled components -# (since pure-python dependencies can be easily vendored into third_party). -# -# When vpython is invoked, it finds this file and builds a python VirtualEnv, -# containing all of the dependencies described in this file, fetching them from -# CIPD (the "Chrome Infrastructure Package Deployer" service). Unlike `pip`, -# this never requires the end-user machine to have a working python extension -# compilation environment. All of these packages are built using: -# https://chromium.googlesource.com/infra/infra/+/master/infra/tools/dockerbuild/ -# -# All python scripts in the repo share this same spec, to avoid dependency -# fragmentation. -# -# If you have depot_tools installed in your $PATH, you can invoke python scripts -# in this repo by running them as you normally would run them, except -# substituting `vpython` instead of `python` on the command line, e.g.: -# vpython path/to/script.py some --arguments -# -# Read more about `vpython` and how to modify this file here: -# https://chromium.googlesource.com/infra/infra/+/master/doc/users/vpython.md - -python_version: "2.7" - -# Used by: -# third_party/catapult -wheel: < - name: "infra/python/wheels/psutil/${platform}_${py_python}_${py_abi}" - version: "version:5.2.2" -> - -# Used by: -# third_party/catapult -wheel: < - name: "infra/python/wheels/pypiwin32/${vpython_platform}" - version: "version:219" - match_tag: < - platform: "win32" - > - match_tag: < - platform: "win_amd64" - > -> - -# Used by: -# tools/swarming_client -wheel: < - name: "infra/python/wheels/six-py2_py3" - version: "version:1.15.0" -> - -# Used by: -# build/android -wheel: < - name: "infra/python/wheels/requests-py2_py3" - version: "version:2.13.0" -> diff --git a/thirdparty/libyuv/AUTHORS b/thirdparty/libyuv/AUTHORS deleted file mode 100644 index 9686ac1..0000000 --- a/thirdparty/libyuv/AUTHORS +++ /dev/null @@ -1,4 +0,0 @@ -# Names should be added to this file like so: -# Name or Organization - -Google Inc. 
diff --git a/thirdparty/libyuv/Android.bp b/thirdparty/libyuv/Android.bp deleted file mode 100644 index ce1f62e..0000000 --- a/thirdparty/libyuv/Android.bp +++ /dev/null @@ -1,156 +0,0 @@ -cc_library { - name: "libyuv", - vendor_available: true, - vndk: { - enabled: true, - }, - - srcs: [ - "source/compare.cc", - "source/compare_common.cc", - "source/compare_gcc.cc", - "source/compare_mmi.cc", - "source/compare_msa.cc", - "source/compare_neon.cc", - "source/compare_neon64.cc", - "source/convert.cc", - "source/convert_argb.cc", - "source/convert_from.cc", - "source/convert_from_argb.cc", - "source/convert_jpeg.cc", - "source/convert_to_argb.cc", - "source/convert_to_i420.cc", - "source/cpu_id.cc", - "source/mjpeg_decoder.cc", - "source/mjpeg_validate.cc", - "source/planar_functions.cc", - "source/rotate.cc", - "source/rotate_any.cc", - "source/rotate_argb.cc", - "source/rotate_common.cc", - "source/rotate_gcc.cc", - "source/rotate_mmi.cc", - "source/rotate_msa.cc", - "source/rotate_neon.cc", - "source/rotate_neon64.cc", - "source/row_any.cc", - "source/row_common.cc", - "source/row_gcc.cc", - "source/row_mmi.cc", - "source/row_msa.cc", - "source/row_neon.cc", - "source/row_neon64.cc", - "source/scale.cc", - "source/scale_any.cc", - "source/scale_argb.cc", - "source/scale_common.cc", - "source/scale_gcc.cc", - "source/scale_mmi.cc", - "source/scale_msa.cc", - "source/scale_neon.cc", - "source/scale_neon64.cc", - "source/scale_uv.cc", - "source/video_common.cc", - ], - - cflags: [ - "-Wall", - "-Werror", - "-Wno-unused-parameter", - "-fexceptions", - "-DHAVE_JPEG", - ], - - shared_libs: ["libjpeg"], - - export_include_dirs: ["include"], -} - -// compatibilty static library until all uses of libyuv_static are replaced -// with libyuv (b/37646797) -cc_library_static { - name: "libyuv_static", - vendor_available: true, - whole_static_libs: ["libyuv"], -} - -cc_test { - name: "libyuv_unittest", - static_libs: ["libyuv"], - shared_libs: ["libjpeg"], - cflags: ["-Wall", "-Werror"], - srcs: [ - "unit_test/basictypes_test.cc", - "unit_test/color_test.cc", - "unit_test/compare_test.cc", - "unit_test/convert_test.cc", - "unit_test/cpu_test.cc", - "unit_test/cpu_thread_test.cc", - "unit_test/math_test.cc", - "unit_test/planar_test.cc", - "unit_test/rotate_argb_test.cc", - "unit_test/rotate_test.cc", - "unit_test/scale_argb_test.cc", - "unit_test/scale_test.cc", - "unit_test/scale_uv_test.cc", - "unit_test/unit_test.cc", - "unit_test/video_common_test.cc", - ], -} - -cc_test { - name: "compare", - gtest: false, - srcs: [ - "util/compare.cc", - ], - static_libs: ["libyuv"], -} - -cc_test { - name: "i444tonv12_eg", - gtest: false, - srcs: [ - "util/i444tonv12_eg.cc", - ], - static_libs: ["libyuv"], -} - -cc_test { - name: "cpuid", - gtest: false, - srcs: [ - "util/cpuid.c", - ], - static_libs: ["libyuv"], -} - -cc_test { - name: "psnr", - gtest: false, - srcs: [ - "util/psnr_main.cc", - "util/psnr.cc", - "util/ssim.cc", - ], - static_libs: ["libyuv"], -} - -cc_test { - name: "yuvconvert", - gtest: false, - srcs: [ - "util/yuvconvert.cc", - ], - static_libs: ["libyuv"], - shared_libs: ["libjpeg"], -} - -cc_test { - name: "yuvconstants", - gtest: false, - srcs: [ - "util/yuvconstants.c", - ], - static_libs: ["libyuv"], -} diff --git a/thirdparty/libyuv/Android.mk b/thirdparty/libyuv/Android.mk deleted file mode 100644 index 2ceb492..0000000 --- a/thirdparty/libyuv/Android.mk +++ /dev/null @@ -1,110 +0,0 @@ -# This is the Android makefile for libyuv for NDK. 
-LOCAL_PATH:= $(call my-dir) - -include $(CLEAR_VARS) - -LOCAL_CPP_EXTENSION := .cc - -LOCAL_SRC_FILES := \ - source/compare.cc \ - source/compare_common.cc \ - source/compare_gcc.cc \ - source/compare_mmi.cc \ - source/compare_msa.cc \ - source/compare_neon.cc \ - source/compare_neon64.cc \ - source/compare_win.cc \ - source/convert.cc \ - source/convert_argb.cc \ - source/convert_from.cc \ - source/convert_from_argb.cc \ - source/convert_to_argb.cc \ - source/convert_to_i420.cc \ - source/cpu_id.cc \ - source/planar_functions.cc \ - source/rotate.cc \ - source/rotate_any.cc \ - source/rotate_argb.cc \ - source/rotate_common.cc \ - source/rotate_gcc.cc \ - source/rotate_mmi.cc \ - source/rotate_msa.cc \ - source/rotate_neon.cc \ - source/rotate_neon64.cc \ - source/rotate_win.cc \ - source/row_any.cc \ - source/row_common.cc \ - source/row_gcc.cc \ - source/row_mmi.cc \ - source/row_msa.cc \ - source/row_neon.cc \ - source/row_neon64.cc \ - source/row_win.cc \ - source/scale.cc \ - source/scale_any.cc \ - source/scale_argb.cc \ - source/scale_common.cc \ - source/scale_gcc.cc \ - source/scale_mmi.cc \ - source/scale_msa.cc \ - source/scale_neon.cc \ - source/scale_neon64.cc \ - source/scale_uv.cc \ - source/scale_win.cc \ - source/video_common.cc - -common_CFLAGS := -Wall -fexceptions -ifneq ($(LIBYUV_DISABLE_JPEG), "yes") -LOCAL_SRC_FILES += \ - source/convert_jpeg.cc \ - source/mjpeg_decoder.cc \ - source/mjpeg_validate.cc -common_CFLAGS += -DHAVE_JPEG -LOCAL_SHARED_LIBRARIES := libjpeg -endif - -LOCAL_CFLAGS += $(common_CFLAGS) -LOCAL_EXPORT_C_INCLUDES := $(LOCAL_PATH)/include -LOCAL_C_INCLUDES += $(LOCAL_PATH)/include -LOCAL_EXPORT_C_INCLUDE_DIRS := $(LOCAL_PATH)/include - -LOCAL_MODULE := libyuv_static -LOCAL_MODULE_TAGS := optional - -include $(BUILD_STATIC_LIBRARY) - -include $(CLEAR_VARS) - -LOCAL_WHOLE_STATIC_LIBRARIES := libyuv_static -LOCAL_MODULE := libyuv -ifneq ($(LIBYUV_DISABLE_JPEG), "yes") -LOCAL_SHARED_LIBRARIES := libjpeg -endif - -include $(BUILD_SHARED_LIBRARY) - -include $(CLEAR_VARS) -LOCAL_STATIC_LIBRARIES := libyuv_static -LOCAL_SHARED_LIBRARIES := libjpeg -LOCAL_MODULE_TAGS := tests -LOCAL_CPP_EXTENSION := .cc -LOCAL_C_INCLUDES += $(LOCAL_PATH)/include -LOCAL_SRC_FILES := \ - unit_test/basictypes_test.cc \ - unit_test/color_test.cc \ - unit_test/compare_test.cc \ - unit_test/convert_test.cc \ - unit_test/cpu_test.cc \ - unit_test/cpu_thread_test.cc \ - unit_test/math_test.cc \ - unit_test/planar_test.cc \ - unit_test/rotate_argb_test.cc \ - unit_test/rotate_test.cc \ - unit_test/scale_argb_test.cc \ - unit_test/scale_test.cc \ - unit_test/scale_uv_test.cc \ - unit_test/unit_test.cc \ - unit_test/video_common_test.cc - -LOCAL_MODULE := libyuv_unittest -include $(BUILD_NATIVE_TEST) diff --git a/thirdparty/libyuv/BUILD.gn b/thirdparty/libyuv/BUILD.gn deleted file mode 100644 index e1c7c1d..0000000 --- a/thirdparty/libyuv/BUILD.gn +++ /dev/null @@ -1,404 +0,0 @@ -# Copyright 2014 The LibYuv Project Authors. All rights reserved. -# -# Use of this source code is governed by a BSD-style license -# that can be found in the LICENSE file in the root of the source -# tree. An additional intellectual property rights grant can be found -# in the file PATENTS. All contributing project authors may -# be found in the AUTHORS file in the root of the source tree. - -import("//testing/test.gni") -import("libyuv.gni") - -declare_args() { - # Set to false to disable building with absl flags. 
- libyuv_use_absl_flags = true - - # When building a shared library using a target in WebRTC or - # Chromium projects that depends on libyuv, setting this flag - # to true makes libyuv symbols visible inside that library. - libyuv_symbols_visible = false -} - -config("libyuv_config") { - include_dirs = [ "include" ] - if (is_android && current_cpu == "arm64") { - ldflags = [ "-Wl,--dynamic-linker,/system/bin/linker64" ] - } - if (is_android && current_cpu != "arm64") { - ldflags = [ "-Wl,--dynamic-linker,/system/bin/linker" ] - } -} - -# This target is built when no specific target is specified on the command line. -group("default") { - testonly = true - deps = [ ":libyuv" ] - if (libyuv_include_tests) { - deps += [ - ":compare", - ":cpuid", - ":i444tonv12_eg", - ":libyuv_unittest", - ":psnr", - ":yuvconstants", - ":yuvconvert", - ] - } -} - -group("libyuv") { - all_dependent_configs = [ ":libyuv_config" ] - deps = [] - - if (is_win && target_cpu == "x64") { - # Compile with clang in order to get inline assembly - public_deps = [ ":libyuv_internal(//build/toolchain/win:win_clang_x64)" ] - } else { - public_deps = [ ":libyuv_internal" ] - } - - if (libyuv_use_neon) { - deps += [ ":libyuv_neon" ] - } - - if (libyuv_use_msa) { - deps += [ ":libyuv_msa" ] - } - - if (libyuv_use_mmi) { - deps += [ ":libyuv_mmi" ] - } - - if (!is_ios && !libyuv_disable_jpeg) { - # Make sure that clients of libyuv link with libjpeg. This can't go in - # libyuv_internal because in Windows x64 builds that will generate a clang - # build of libjpeg, and we don't want two copies. - deps += [ "//third_party:jpeg" ] - } -} - -static_library("libyuv_internal") { - visibility = [ ":*" ] - - sources = [ - # Headers - "include/libyuv.h", - "include/libyuv/basic_types.h", - "include/libyuv/compare.h", - "include/libyuv/convert.h", - "include/libyuv/convert_argb.h", - "include/libyuv/convert_from.h", - "include/libyuv/convert_from_argb.h", - "include/libyuv/cpu_id.h", - "include/libyuv/mjpeg_decoder.h", - "include/libyuv/planar_functions.h", - "include/libyuv/rotate.h", - "include/libyuv/rotate_argb.h", - "include/libyuv/rotate_row.h", - "include/libyuv/row.h", - "include/libyuv/scale.h", - "include/libyuv/scale_argb.h", - "include/libyuv/scale_row.h", - "include/libyuv/scale_uv.h", - "include/libyuv/version.h", - "include/libyuv/video_common.h", - - # Source Files - "source/compare.cc", - "source/compare_common.cc", - "source/compare_gcc.cc", - "source/compare_win.cc", - "source/convert.cc", - "source/convert_argb.cc", - "source/convert_from.cc", - "source/convert_from_argb.cc", - "source/convert_jpeg.cc", - "source/convert_to_argb.cc", - "source/convert_to_i420.cc", - "source/cpu_id.cc", - "source/mjpeg_decoder.cc", - "source/mjpeg_validate.cc", - "source/planar_functions.cc", - "source/rotate.cc", - "source/rotate_any.cc", - "source/rotate_argb.cc", - "source/rotate_common.cc", - "source/rotate_gcc.cc", - "source/rotate_win.cc", - "source/row_any.cc", - "source/row_common.cc", - "source/row_gcc.cc", - "source/row_win.cc", - "source/scale.cc", - "source/scale_any.cc", - "source/scale_argb.cc", - "source/scale_common.cc", - "source/scale_gcc.cc", - "source/scale_uv.cc", - "source/scale_win.cc", - "source/video_common.cc", - ] - - configs += [ ":libyuv_config" ] - defines = [] - deps = [] - - if (libyuv_symbols_visible) { - configs -= [ "//build/config/gcc:symbol_visibility_hidden" ] - configs += [ "//build/config/gcc:symbol_visibility_default" ] - } - - if (!is_ios && !libyuv_disable_jpeg) { - defines += [ "HAVE_JPEG" ] - - 
# Needed to pull in libjpeg headers. Can't add //third_party:jpeg to deps - # because in Windows x64 build it will get compiled with clang. - deps += [ "//third_party:jpeg_includes" ] - } - - # Always enable optimization for Release and NaCl builds (to workaround - # crbug.com/538243). - if (!is_debug || is_nacl) { - configs -= [ "//build/config/compiler:default_optimization" ] - - # Enable optimize for speed (-O2) over size (-Os). - configs += [ "//build/config/compiler:optimize_max" ] - } - - # To enable AVX2 or other cpu optimization, pass flag here - if (!is_win) { - cflags = [ - # "-mpopcnt", - # "-mavx2", - # "-mfma", - "-ffp-contract=fast", # Enable fma vectorization for NEON. - ] - } - if (!libyuv_use_mmi) { - defines += [ "LIBYUV_DISABLE_MMI" ] - } -} - -if (libyuv_use_neon) { - static_library("libyuv_neon") { - sources = [ - # ARM Source Files - "source/compare_neon.cc", - "source/compare_neon64.cc", - "source/rotate_neon.cc", - "source/rotate_neon64.cc", - "source/row_neon.cc", - "source/row_neon64.cc", - "source/scale_neon.cc", - "source/scale_neon64.cc", - ] - - deps = [ ":libyuv_internal" ] - - public_configs = [ ":libyuv_config" ] - - # Always enable optimization for Release and NaCl builds (to workaround - # crbug.com/538243). - if (!is_debug) { - configs -= [ "//build/config/compiler:default_optimization" ] - - # Enable optimize for speed (-O2) over size (-Os). - # TODO(fbarchard): Consider optimize_speed which is O3. - configs += [ "//build/config/compiler:optimize_max" ] - } - - if (current_cpu != "arm64") { - configs -= [ "//build/config/compiler:compiler_arm_fpu" ] - cflags = [ "-mfpu=neon" ] - } - } -} - -if (libyuv_use_msa) { - static_library("libyuv_msa") { - sources = [ - # MSA Source Files - "source/compare_msa.cc", - "source/rotate_msa.cc", - "source/row_msa.cc", - "source/scale_msa.cc", - ] - - deps = [ ":libyuv_internal" ] - - public_configs = [ ":libyuv_config" ] - } -} - -if (libyuv_use_mmi) { - static_library("libyuv_mmi") { - sources = [ - # MMI Source Files - "source/compare_mmi.cc", - "source/rotate_mmi.cc", - "source/row_mmi.cc", - "source/scale_mmi.cc", - ] - - deps = [ ":libyuv_internal" ] - - public_configs = [ ":libyuv_config" ] - } -} - -if (libyuv_include_tests) { - config("libyuv_unittest_warnings_config") { - if (!is_win) { - cflags = [ - # TODO(fbarchard): Fix sign and unused variable warnings. 
- "-Wno-sign-compare", - "-Wno-unused-variable", - ] - } - if (is_win) { - cflags = [ - "/wd4245", # signed/unsigned mismatch - "/wd4189", # local variable is initialized but not referenced - ] - } - } - config("libyuv_unittest_config") { - defines = [ "GTEST_RELATIVE_PATH" ] - } - - test("libyuv_unittest") { - testonly = true - - sources = [ - "unit_test/basictypes_test.cc", - "unit_test/color_test.cc", - "unit_test/compare_test.cc", - "unit_test/convert_test.cc", - "unit_test/cpu_test.cc", - "unit_test/cpu_thread_test.cc", - "unit_test/math_test.cc", - "unit_test/planar_test.cc", - "unit_test/rotate_argb_test.cc", - "unit_test/rotate_test.cc", - "unit_test/scale_argb_test.cc", - "unit_test/scale_test.cc", - "unit_test/scale_uv_test.cc", - "unit_test/unit_test.cc", - "unit_test/unit_test.h", - "unit_test/video_common_test.cc", - ] - - deps = [ - ":libyuv", - "//testing/gtest", - ] - - defines = [] - if (libyuv_use_absl_flags) { - defines += [ "LIBYUV_USE_ABSL_FLAGS" ] - deps += [ - "//third_party/abseil-cpp/absl/flags:flag", - "//third_party/abseil-cpp/absl/flags:parse", - ] - } - - configs += [ ":libyuv_unittest_warnings_config" ] - - public_deps = [ "//testing/gtest" ] - public_configs = [ ":libyuv_unittest_config" ] - - if (is_linux || is_chromeos) { - cflags = [ "-fexceptions" ] - } - if (is_ios) { - configs -= [ "//build/config/compiler:default_symbols" ] - configs += [ "//build/config/compiler:symbols" ] - cflags = [ "-Wno-sometimes-uninitialized" ] - } - if (!is_ios && !libyuv_disable_jpeg) { - defines += [ "HAVE_JPEG" ] - } - if (is_android) { - deps += [ "//testing/android/native_test:native_test_native_code" ] - } - - # TODO(YangZhang): These lines can be removed when high accuracy - # YUV to RGB to Neon is ported. - if ((target_cpu == "armv7" || target_cpu == "armv7s" || - (target_cpu == "arm" && arm_version >= 7) || target_cpu == "arm64") && - (arm_use_neon || arm_optionally_use_neon)) { - defines += [ "LIBYUV_NEON" ] - } - - defines += [ - # Enable the following 3 macros to turn off assembly for specified CPU. - # "LIBYUV_DISABLE_X86", - # "LIBYUV_DISABLE_NEON", - # Enable the following macro to build libyuv as a shared library (dll). 
- # "LIBYUV_USING_SHARED_LIBRARY" - ] - } - - executable("compare") { - sources = [ - # sources - "util/compare.cc", - ] - deps = [ ":libyuv" ] - if (is_linux || is_chromeos) { - cflags = [ "-fexceptions" ] - } - } - - executable("yuvconvert") { - sources = [ - # sources - "util/yuvconvert.cc", - ] - deps = [ ":libyuv" ] - if (is_linux || is_chromeos) { - cflags = [ "-fexceptions" ] - } - } - - executable("yuvconstants") { - sources = [ - # sources - "util/yuvconstants.c", - ] - deps = [ ":libyuv" ] - if (is_linux || is_chromeos) { - cflags = [ "-fexceptions" ] - } - } - - executable("psnr") { - sources = [ - # sources - "util/psnr.cc", - "util/psnr_main.cc", - "util/ssim.cc", - ] - deps = [ ":libyuv" ] - - if (!is_ios && !libyuv_disable_jpeg) { - defines = [ "HAVE_JPEG" ] - } - } - - executable("i444tonv12_eg") { - sources = [ - # sources - "util/i444tonv12_eg.cc", - ] - deps = [ ":libyuv" ] - } - - executable("cpuid") { - sources = [ - # sources - "util/cpuid.c", - ] - deps = [ ":libyuv" ] - } -} diff --git a/thirdparty/libyuv/CM_linux_packages.cmake b/thirdparty/libyuv/CM_linux_packages.cmake deleted file mode 100644 index 5f676f8..0000000 --- a/thirdparty/libyuv/CM_linux_packages.cmake +++ /dev/null @@ -1,69 +0,0 @@ -# determine the version number from the #define in libyuv/version.h -EXECUTE_PROCESS ( - COMMAND grep --perl-regex --only-matching "(?<=LIBYUV_VERSION )[0-9]+" include/libyuv/version.h - WORKING_DIRECTORY ${PROJECT_SOURCE_DIR} - OUTPUT_VARIABLE YUV_VERSION_NUMBER - OUTPUT_STRIP_TRAILING_WHITESPACE ) -SET ( YUV_VER_MAJOR 0 ) -SET ( YUV_VER_MINOR 0 ) -SET ( YUV_VER_PATCH ${YUV_VERSION_NUMBER} ) -SET ( YUV_VERSION ${YUV_VER_MAJOR}.${YUV_VER_MINOR}.${YUV_VER_PATCH} ) -MESSAGE ( "Building ver.: ${YUV_VERSION}" ) - -# is this a 32-bit or 64-bit build? 
-IF ( CMAKE_SIZEOF_VOID_P EQUAL 8 ) - SET ( YUV_BIT_SIZE 64 ) -ELSEIF ( CMAKE_SIZEOF_VOID_P EQUAL 4 ) - SET ( YUV_BIT_SIZE 32 ) -ELSE () - MESSAGE ( FATAL_ERROR "CMAKE_SIZEOF_VOID_P=${CMAKE_SIZEOF_VOID_P}" ) -ENDIF () - -# detect if this is a ARM build -STRING (FIND "${CMAKE_CXX_COMPILER}" "arm-linux-gnueabihf-g++" pos) -IF ( ${pos} EQUAL -1 ) - SET ( YUV_CROSS_COMPILE_FOR_ARM7 FALSE ) -ELSE () - MESSAGE ( "Cross compiling for ARM7" ) - SET ( YUV_CROSS_COMPILE_FOR_ARM7 TRUE ) -ENDIF () -STRING (FIND "${CMAKE_SYSTEM_PROCESSOR}" "arm" pos) -IF ( ${pos} EQUAL -1 ) - SET ( YUV_COMPILE_FOR_ARM7 FALSE ) -ELSE () - MESSAGE ( "Compiling for ARM" ) - SET ( YUV_COMPILE_FOR_ARM7 TRUE ) -ENDIF () - -# setup the sytem name, such as "x86-32", "amd-64", and "arm-32 -IF ( ${YUV_CROSS_COMPILE_FOR_ARM7} OR ${YUV_COMPILE_FOR_ARM7} ) - SET ( YUV_SYSTEM_NAME "armhf-${YUV_BIT_SIZE}" ) -ELSE () - IF ( YUV_BIT_SIZE EQUAL 32 ) - SET ( YUV_SYSTEM_NAME "x86-${YUV_BIT_SIZE}" ) - ELSE () - SET ( YUV_SYSTEM_NAME "amd-${YUV_BIT_SIZE}" ) - ENDIF () -ENDIF () -MESSAGE ( "Packaging for: ${YUV_SYSTEM_NAME}" ) - -# define all the variables needed by CPack to create .deb and .rpm packages -SET ( CPACK_PACKAGE_VENDOR "Frank Barchard" ) -SET ( CPACK_PACKAGE_CONTACT "fbarchard@chromium.org" ) -SET ( CPACK_PACKAGE_VERSION ${YUV_VERSION} ) -SET ( CPACK_PACKAGE_VERSION_MAJOR ${YUV_VER_MAJOR} ) -SET ( CPACK_PACKAGE_VERSION_MINOR ${YUV_VER_MINOR} ) -SET ( CPACK_PACKAGE_VERSION_PATCH ${YUV_VER_PATCH} ) -SET ( CPACK_RESOURCE_FILE_LICENSE ${PROJECT_SOURCE_DIR}/LICENSE ) -SET ( CPACK_SYSTEM_NAME "linux-${YUV_SYSTEM_NAME}" ) -SET ( CPACK_PACKAGE_NAME "libyuv" ) -SET ( CPACK_PACKAGE_DESCRIPTION_SUMMARY "YUV library" ) -SET ( CPACK_PACKAGE_DESCRIPTION "YUV library and YUV conversion tool" ) -SET ( CPACK_DEBIAN_PACKAGE_SECTION "other" ) -SET ( CPACK_DEBIAN_PACKAGE_PRIORITY "optional" ) -SET ( CPACK_DEBIAN_PACKAGE_MAINTAINER "Frank Barchard " ) -SET ( CPACK_GENERATOR "DEB;RPM" ) - -# create the .deb and .rpm files (you'll need build-essential and rpm tools) -INCLUDE( CPack ) - diff --git a/thirdparty/libyuv/CMakeLists.txt b/thirdparty/libyuv/CMakeLists.txt deleted file mode 100644 index f25ce12..0000000 --- a/thirdparty/libyuv/CMakeLists.txt +++ /dev/null @@ -1,86 +0,0 @@ -# CMakeLists for libyuv -# Originally created for "roxlu build system" to compile libyuv on windows -# Run with -DTEST=ON to build unit tests - -PROJECT(YUV C CXX) # "C" is required even for C++ projects -CMAKE_MINIMUM_REQUIRED(VERSION 2.8) -OPTION(TEST "Built unit tests" OFF) - -SET(ly_base_dir ${PROJECT_SOURCE_DIR}) -SET(ly_src_dir ${ly_base_dir}/source) -SET(ly_inc_dir ${ly_base_dir}/include) -SET(ly_tst_dir ${ly_base_dir}/unit_test) -SET(ly_lib_name yuv) -SET(ly_lib_static ${ly_lib_name}) -SET(ly_lib_shared ${ly_lib_name}_shared) - -FILE(GLOB_RECURSE ly_source_files ${ly_src_dir}/*.cc) -LIST(SORT ly_source_files) - -FILE(GLOB_RECURSE ly_unittest_sources ${ly_tst_dir}/*.cc) -LIST(SORT ly_unittest_sources) - -INCLUDE_DIRECTORIES(BEFORE ${ly_inc_dir}) - -# this creates the static library (.a) -ADD_LIBRARY(${ly_lib_static} STATIC ${ly_source_files}) - -# this creates the shared library (.so) -ADD_LIBRARY(${ly_lib_shared} SHARED ${ly_source_files}) -SET_TARGET_PROPERTIES(${ly_lib_shared} PROPERTIES OUTPUT_NAME "${ly_lib_name}") -SET_TARGET_PROPERTIES(${ly_lib_shared} PROPERTIES PREFIX "lib") - -# this creates the conversion tool -# ADD_EXECUTABLE(yuvconvert ${ly_base_dir}/util/yuvconvert.cc) -# TARGET_LINK_LIBRARIES(yuvconvert ${ly_lib_static}) -# INCLUDE(FindJPEG) - -# 
if(JPEG_FOUND) -# include_directories(${JPEG_INCLUDE_DIR}) -# target_link_libraries(yuvconvert ${JPEG_LIBRARY}) -# add_definitions(-DHAVE_JPEG) -# endif() -if(TEST) - find_library(GTEST_LIBRARY gtest) - - if(GTEST_LIBRARY STREQUAL "GTEST_LIBRARY-NOTFOUND") - set(GTEST_SRC_DIR /usr/src/gtest CACHE STRING "Location of gtest sources") - - if(EXISTS ${GTEST_SRC_DIR}/src/gtest-all.cc) - message(STATUS "building gtest from sources in ${GTEST_SRC_DIR}") - set(gtest_sources ${GTEST_SRC_DIR}/src/gtest-all.cc) - add_library(gtest STATIC ${gtest_sources}) - include_directories(${GTEST_SRC_DIR}) - include_directories(${GTEST_SRC_DIR}/include) - set(GTEST_LIBRARY gtest) - else() - message(FATAL_ERROR "TEST is set but unable to find gtest library") - endif() - endif() - - add_executable(libyuv_unittest ${ly_unittest_sources}) - target_link_libraries(libyuv_unittest ${ly_lib_name} ${GTEST_LIBRARY}) - find_library(PTHREAD_LIBRARY pthread) - - if(NOT PTHREAD_LIBRARY STREQUAL "PTHREAD_LIBRARY-NOTFOUND") - target_link_libraries(libyuv_unittest pthread) - endif() - - if(JPEG_FOUND) - target_link_libraries(libyuv_unittest ${JPEG_LIBRARY}) - endif() - - if(NACL AND NACL_LIBC STREQUAL "newlib") - target_link_libraries(libyuv_unittest glibc-compat) - endif() -endif() - -# install the conversion tool, .so, .a, and all the header files -# INSTALL(PROGRAMS ${CMAKE_BINARY_DIR}/yuvconvert DESTINATION bin) -INSTALL(TARGETS ${ly_lib_static} DESTINATION lib) - -# INSTALL ( TARGETS ${ly_lib_shared} LIBRARY DESTINATION lib RUNTIME DESTINATION bin ) -INSTALL(DIRECTORY ${PROJECT_SOURCE_DIR}/include/ DESTINATION include) - -# create the .deb and .rpm packages using cpack -INCLUDE(CM_linux_packages.cmake) diff --git a/thirdparty/libyuv/DIR_METADATA b/thirdparty/libyuv/DIR_METADATA deleted file mode 100644 index 8bc04f1..0000000 --- a/thirdparty/libyuv/DIR_METADATA +++ /dev/null @@ -1,3 +0,0 @@ -monorail { - component: "Internals>Images>Codecs" -} diff --git a/thirdparty/libyuv/LICENSE b/thirdparty/libyuv/LICENSE deleted file mode 100644 index c911747..0000000 --- a/thirdparty/libyuv/LICENSE +++ /dev/null @@ -1,29 +0,0 @@ -Copyright 2011 The LibYuv Project Authors. All rights reserved. - -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: - - * Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - * Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in - the documentation and/or other materials provided with the - distribution. - - * Neither the name of Google nor the names of its contributors may - be used to endorse or promote products derived from this software - without specific prior written permission. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT -HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/thirdparty/libyuv/OWNERS b/thirdparty/libyuv/OWNERS deleted file mode 100644 index a96669f..0000000 --- a/thirdparty/libyuv/OWNERS +++ /dev/null @@ -1,10 +0,0 @@ -mbonadei@chromium.org -fbarchard@chromium.org -magjed@chromium.org -pbos@chromium.org - -per-file *.gn=mbonadei@chromium.org -per-file .gitignore=* -per-file AUTHORS=* -per-file DEPS=* -per-file PRESUBMIT.py=mbonadei@chromium.org diff --git a/thirdparty/libyuv/PATENTS b/thirdparty/libyuv/PATENTS deleted file mode 100644 index 64aa5c9..0000000 --- a/thirdparty/libyuv/PATENTS +++ /dev/null @@ -1,24 +0,0 @@ -Additional IP Rights Grant (Patents) - -"This implementation" means the copyrightable works distributed by -Google as part of the LibYuv code package. - -Google hereby grants to you a perpetual, worldwide, non-exclusive, -no-charge, irrevocable (except as stated in this section) patent -license to make, have made, use, offer to sell, sell, import, -transfer, and otherwise run, modify and propagate the contents of this -implementation of the LibYuv code package, where such license applies -only to those patent claims, both currently owned by Google and -acquired in the future, licensable by Google that are necessarily -infringed by this implementation of the LibYuv code package. This -grant does not include claims that would be infringed only as a -consequence of further modification of this implementation. If you or -your agent or exclusive licensee institute or order or agree to the -institution of patent litigation against any entity (including a -cross-claim or counterclaim in a lawsuit) alleging that this -implementation of the LibYuv code package or any code incorporated -within this implementation of the LibYuv code package constitutes -direct or contributory patent infringement, or inducement of patent -infringement, then any patent rights granted to you under this License -for this implementation of the LibYuv code package shall terminate as -of the date such litigation is filed. \ No newline at end of file diff --git a/thirdparty/libyuv/PRESUBMIT.py b/thirdparty/libyuv/PRESUBMIT.py deleted file mode 100644 index b867239..0000000 --- a/thirdparty/libyuv/PRESUBMIT.py +++ /dev/null @@ -1,49 +0,0 @@ -# Copyright 2017 The LibYuv Project Authors. All rights reserved. -# -# Use of this source code is governed by a BSD-style license -# that can be found in the LICENSE file in the root of the source -# tree. An additional intellectual property rights grant can be found -# in the file PATENTS. All contributing project authors may -# be found in the AUTHORS file in the root of the source tree. 
- - -def _CommonChecks(input_api, output_api): - """Checks common to both upload and commit.""" - results = [] - results.extend(input_api.canned_checks.RunPylint(input_api, output_api, - files_to_skip=(r'^base[\\\/].*\.py$', - r'^build[\\\/].*\.py$', - r'^buildtools[\\\/].*\.py$', - r'^ios[\\\/].*\.py$', - r'^out.*[\\\/].*\.py$', - r'^testing[\\\/].*\.py$', - r'^third_party[\\\/].*\.py$', - r'^tools[\\\/].*\.py$', - # TODO(kjellander): should arguably be checked. - r'^tools_libyuv[\\\/]valgrind[\\\/].*\.py$', - r'^xcodebuild.*[\\\/].*\.py$',), - disabled_warnings=['F0401', # Failed to import x - 'E0611', # No package y in x - 'W0232', # Class has no __init__ method - ], - pylintrc='pylintrc')) - return results - - -def CheckChangeOnUpload(input_api, output_api): - results = [] - results.extend(_CommonChecks(input_api, output_api)) - results.extend( - input_api.canned_checks.CheckGNFormatted(input_api, output_api)) - return results - - -def CheckChangeOnCommit(input_api, output_api): - results = [] - results.extend(_CommonChecks(input_api, output_api)) - results.extend(input_api.canned_checks.CheckOwners(input_api, output_api)) - results.extend(input_api.canned_checks.CheckChangeWasUploaded( - input_api, output_api)) - results.extend(input_api.canned_checks.CheckChangeHasDescription( - input_api, output_api)) - return results diff --git a/thirdparty/libyuv/README.chromium b/thirdparty/libyuv/README.chromium deleted file mode 100644 index a493527..0000000 --- a/thirdparty/libyuv/README.chromium +++ /dev/null @@ -1,8 +0,0 @@ -Name: libyuv -URL: http://code.google.com/p/libyuv/ -Version: 1787 -License: BSD -License File: LICENSE - -Description: -libyuv is an open source project that includes YUV conversion and scaling functionality. diff --git a/thirdparty/libyuv/README.md b/thirdparty/libyuv/README.md deleted file mode 100644 index db70b7f..0000000 --- a/thirdparty/libyuv/README.md +++ /dev/null @@ -1,18 +0,0 @@ -**libyuv** is an open source project that includes YUV scaling and conversion functionality. - -* Scale YUV to prepare content for compression, with point, bilinear or box filter. -* Convert to YUV from webcam formats for compression. -* Convert to RGB formats for rendering/effects. -* Rotate by 90/180/270 degrees to adjust for mobile devices in portrait mode. -* Optimized for SSSE3/AVX2 on x86/x64. -* Optimized for Neon on Arm. -* Optimized for MSA on Mips. - -### Development - -See [Getting started][1] for instructions on how to get started developing. - -You can also browse the [docs directory][2] for more documentation. - -[1]: ./docs/getting_started.md -[2]: ./docs/ diff --git a/thirdparty/libyuv/build_overrides/build.gni b/thirdparty/libyuv/build_overrides/build.gni deleted file mode 100644 index 473aea5..0000000 --- a/thirdparty/libyuv/build_overrides/build.gni +++ /dev/null @@ -1,53 +0,0 @@ -# Copyright 2016 The LibYuv Project Authors. All rights reserved. -# -# Use of this source code is governed by a BSD-style license -# that can be found in the LICENSE file in the root of the source -# tree. An additional intellectual property rights grant can be found -# in the file PATENTS. All contributing project authors may -# be found in the AUTHORS file in the root of the source tree. - -# Variable that can be used to support multiple build scenarios, like having -# Chromium specific targets in a client project's GN file etc. -build_with_chromium = false - -# Some non-Chromium builds don't support building java targets. 
-enable_java_templates = true - -# Allow using custom suppressions files (currently not used by libyuv). -asan_suppressions_file = "//build/sanitizers/asan_suppressions.cc" -lsan_suppressions_file = "//build/sanitizers/lsan_suppressions.cc" -tsan_suppressions_file = "//build/sanitizers/tsan_suppressions.cc" - -msan_blacklist_path = - rebase_path("//tools_libyuv/msan/blacklist.txt", root_build_dir) -ubsan_blacklist_path = - rebase_path("//tools_libyuv/ubsan/blacklist.txt", root_build_dir) -ubsan_vptr_blacklist_path = - rebase_path("//tools_libyuv/ubsan/vptr_blacklist.txt", root_build_dir) - -# For Chromium, Android 32-bit non-component, non-clang builds hit a 4GiB size -# limit, making them requiring symbol_level=2. WebRTC doesn't hit that problem -# so we just ignore that assert. See https://crbug.com/648948 for more info. -ignore_elf32_limitations = true - -# Use bundled hermetic Xcode installation maintained by Chromium, -# except for local iOS builds where it is unsupported. -if (host_os == "mac") { - _result = exec_script("//build/mac/should_use_hermetic_xcode.py", - [ target_os ], - "value") - assert(_result != 2, - "Do not allow building targets with the default" + - "hermetic toolchain if the minimum OS version is not met.") - use_system_xcode = _result == 0 -} - -declare_args() { - # Tracing support requires //third_party/perfetto. - enable_base_tracing = false - use_perfetto_client_library = false - - # Allows googletest to pretty-print various absl types. - # Defined here rather than in gtest.gni to match chromium. - gtest_enable_absl_printers = true -} diff --git a/thirdparty/libyuv/build_overrides/gtest.gni b/thirdparty/libyuv/build_overrides/gtest.gni deleted file mode 100644 index d3c3f68..0000000 --- a/thirdparty/libyuv/build_overrides/gtest.gni +++ /dev/null @@ -1,19 +0,0 @@ -# Copyright (c) 2016 The LibYuv project authors. All Rights Reserved. -# -# Use of this source code is governed by a BSD-style license -# that can be found in the LICENSE file in the root of the source -# tree. An additional intellectual property rights grant can be found -# in the file PATENTS. All contributing project authors may -# be found in the AUTHORS file in the root of the source tree. - -# Include support for registering main function in multi-process tests. -gtest_include_multiprocess = true - -# Include support for platform-specific operations across unit tests. -gtest_include_platform_test = true - -# Exclude support for testing Objective C code on OS X and iOS. -gtest_include_objc_support = true - -# Exclude support for flushing coverage files on iOS. -gtest_include_ios_coverage = true diff --git a/thirdparty/libyuv/cleanup_links.py b/thirdparty/libyuv/cleanup_links.py deleted file mode 100644 index ba29078..0000000 --- a/thirdparty/libyuv/cleanup_links.py +++ /dev/null @@ -1,107 +0,0 @@ -#!/usr/bin/env python -# Copyright 2017 The LibYuv Project Authors. All rights reserved. -# -# Use of this source code is governed by a BSD-style license -# that can be found in the LICENSE file in the root of the source -# tree. An additional intellectual property rights grant can be found -# in the file PATENTS. All contributing project authors may -# be found in the AUTHORS file in the root of the source tree. - -# This is a copy of the file from WebRTC in: -# https://chromium.googlesource.com/external/webrtc/+/master/cleanup_links.py - -"""Script to cleanup symlinks created from setup_links.py. 
- -Before 177567c518b121731e507e9b9c4049c4dc96e4c8 (#15754) we had a Chromium -checkout which we created symlinks into. In order to do clean syncs after -landing that change, this script cleans up any old symlinks, avoiding annoying -manual cleanup needed in order to complete gclient sync. -""" - -import logging -import optparse -import os -import shelve -import subprocess -import sys - - -ROOT_DIR = os.path.dirname(os.path.abspath(__file__)) -LINKS_DB = 'links' - -# Version management to make future upgrades/downgrades easier to support. -SCHEMA_VERSION = 1 - -class WebRTCLinkSetup(object): - def __init__(self, links_db, dry_run=False): - self._dry_run = dry_run - self._links_db = links_db - - def CleanupLinks(self): - logging.debug('CleanupLinks') - for source, link_path in self._links_db.iteritems(): - if source == 'SCHEMA_VERSION': - continue - if os.path.islink(link_path) or sys.platform.startswith('win'): - # os.path.islink() always returns false on Windows - # See http://bugs.python.org/issue13143. - logging.debug('Removing link to %s at %s', source, link_path) - if not self._dry_run: - if os.path.exists(link_path): - if sys.platform.startswith('win') and os.path.isdir(link_path): - subprocess.check_call(['rmdir', '/q', '/s', link_path], - shell=True) - else: - os.remove(link_path) - del self._links_db[source] - - -def _initialize_database(filename): - links_database = shelve.open(filename) - # Wipe the database if this version of the script ends up looking at a - # newer (future) version of the links db, just to be sure. - version = links_database.get('SCHEMA_VERSION') - if version and version != SCHEMA_VERSION: - logging.info('Found database with schema version %s while this script only ' - 'supports %s. Wiping previous database contents.', version, - SCHEMA_VERSION) - links_database.clear() - links_database['SCHEMA_VERSION'] = SCHEMA_VERSION - return links_database - - -def main(): - parser = optparse.OptionParser() - parser.add_option('-d', '--dry-run', action='store_true', default=False, - help='Print what would be done, but don\'t perform any ' - 'operations. This will automatically set logging to ' - 'verbose.') - parser.add_option('-v', '--verbose', action='store_const', - const=logging.DEBUG, default=logging.INFO, - help='Print verbose output for debugging.') - options, _ = parser.parse_args() - - if options.dry_run: - options.verbose = logging.DEBUG - logging.basicConfig(format='%(message)s', level=options.verbose) - - # Work from the root directory of the checkout. - script_dir = os.path.dirname(os.path.abspath(__file__)) - os.chdir(script_dir) - - # The database file gets .db appended on some platforms. - db_filenames = [LINKS_DB, LINKS_DB + '.db'] - if any(os.path.isfile(f) for f in db_filenames): - links_database = _initialize_database(LINKS_DB) - try: - symlink_creator = WebRTCLinkSetup(links_database, options.dry_run) - symlink_creator.CleanupLinks() - finally: - for f in db_filenames: - if os.path.isfile(f): - os.remove(f) - return 0 - - -if __name__ == '__main__': - sys.exit(main()) diff --git a/thirdparty/libyuv/codereview.settings b/thirdparty/libyuv/codereview.settings deleted file mode 100644 index b226fae..0000000 --- a/thirdparty/libyuv/codereview.settings +++ /dev/null @@ -1,5 +0,0 @@ -# This file is used by `git cl` to get repository specific information. 
-CODE_REVIEW_SERVER: codereview.chromium.org -GERRIT_HOST: True -PROJECT: libyuv -VIEW_VC: https://chromium.googlesource.com/libyuv/libyuv/+/ diff --git a/thirdparty/libyuv/download_vs_toolchain.py b/thirdparty/libyuv/download_vs_toolchain.py deleted file mode 100644 index 49d0693..0000000 --- a/thirdparty/libyuv/download_vs_toolchain.py +++ /dev/null @@ -1,29 +0,0 @@ -#!/usr/bin/env python -# -# Copyright 2014 The LibYuv Project Authors. All rights reserved. -# -# Use of this source code is governed by a BSD-style license -# that can be found in the LICENSE file in the root of the source -# tree. An additional intellectual property rights grant can be found -# in the file PATENTS. All contributing project authors may -# be found in the AUTHORS file in the root of the source tree. - -# This script is used to run the vs_toolchain.py script to download the -# Visual Studio toolchain. It's just a temporary measure while waiting for the -# Chrome team to move find_depot_tools into src/build to get rid of these -# workarounds (similar one in gyp_libyuv). - -import os -import sys - - -checkout_root = os.path.dirname(os.path.realpath(__file__)) -sys.path.insert(0, os.path.join(checkout_root, 'build')) -sys.path.insert(0, os.path.join(checkout_root, 'tools', 'find_depot_tools')) - - -import vs_toolchain # pylint: disable=wrong-import-position - - -if __name__ == '__main__': - sys.exit(vs_toolchain.main()) diff --git a/thirdparty/libyuv/include/libyuv.h b/thirdparty/libyuv/include/libyuv.h deleted file mode 100644 index a06e123..0000000 --- a/thirdparty/libyuv/include/libyuv.h +++ /dev/null @@ -1,33 +0,0 @@ -/* - * Copyright 2011 The LibYuv Project Authors. All rights reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#ifndef INCLUDE_LIBYUV_H_ -#define INCLUDE_LIBYUV_H_ - -#include "libyuv/basic_types.h" -#include "libyuv/compare.h" -#include "libyuv/convert.h" -#include "libyuv/convert_argb.h" -#include "libyuv/convert_from.h" -#include "libyuv/convert_from_argb.h" -#include "libyuv/cpu_id.h" -#include "libyuv/mjpeg_decoder.h" -#include "libyuv/planar_functions.h" -#include "libyuv/rotate.h" -#include "libyuv/rotate_argb.h" -#include "libyuv/row.h" -#include "libyuv/scale.h" -#include "libyuv/scale_argb.h" -#include "libyuv/scale_row.h" -#include "libyuv/scale_uv.h" -#include "libyuv/version.h" -#include "libyuv/video_common.h" - -#endif // INCLUDE_LIBYUV_H_ diff --git a/thirdparty/libyuv/include/libyuv/basic_types.h b/thirdparty/libyuv/include/libyuv/basic_types.h deleted file mode 100644 index 1bea67f..0000000 --- a/thirdparty/libyuv/include/libyuv/basic_types.h +++ /dev/null @@ -1,68 +0,0 @@ -/* - * Copyright 2011 The LibYuv Project Authors. All rights reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. 
- */ - -#ifndef INCLUDE_LIBYUV_BASIC_TYPES_H_ -#define INCLUDE_LIBYUV_BASIC_TYPES_H_ - -#include <stddef.h> // For size_t and NULL - -#if !defined(INT_TYPES_DEFINED) && !defined(GG_LONGLONG) -#define INT_TYPES_DEFINED - -#if defined(_MSC_VER) && (_MSC_VER < 1600) -#include <sys/types.h> // for uintptr_t on x86 -typedef unsigned __int64 uint64_t; -typedef __int64 int64_t; -typedef unsigned int uint32_t; -typedef int int32_t; -typedef unsigned short uint16_t; -typedef short int16_t; -typedef unsigned char uint8_t; -typedef signed char int8_t; -#else -#include <stdint.h> // for uintptr_t and C99 types -#endif // defined(_MSC_VER) && (_MSC_VER < 1600) -// Types are deprecated. Enable this macro for legacy types. -#ifdef LIBYUV_LEGACY_TYPES -typedef uint64_t uint64; -typedef int64_t int64; -typedef uint32_t uint32; -typedef int32_t int32; -typedef uint16_t uint16; -typedef int16_t int16; -typedef uint8_t uint8; -typedef int8_t int8; -#endif // LIBYUV_LEGACY_TYPES -#endif // INT_TYPES_DEFINED - -#if !defined(LIBYUV_API) -#if defined(_WIN32) || defined(__CYGWIN__) -#if defined(LIBYUV_BUILDING_SHARED_LIBRARY) -#define LIBYUV_API __declspec(dllexport) -#elif defined(LIBYUV_USING_SHARED_LIBRARY) -#define LIBYUV_API __declspec(dllimport) -#else -#define LIBYUV_API -#endif // LIBYUV_BUILDING_SHARED_LIBRARY -#elif defined(__GNUC__) && (__GNUC__ >= 4) && !defined(__APPLE__) && \ - (defined(LIBYUV_BUILDING_SHARED_LIBRARY) || \ - defined(LIBYUV_USING_SHARED_LIBRARY)) -#define LIBYUV_API __attribute__((visibility("default"))) -#else -#define LIBYUV_API -#endif // __GNUC__ -#endif // LIBYUV_API - -// TODO(fbarchard): Remove bool macros. -#define LIBYUV_BOOL int -#define LIBYUV_FALSE 0 -#define LIBYUV_TRUE 1 - -#endif // INCLUDE_LIBYUV_BASIC_TYPES_H_ diff --git a/thirdparty/libyuv/include/libyuv/compare.h b/thirdparty/libyuv/include/libyuv/compare.h deleted file mode 100644 index 3353ad7..0000000 --- a/thirdparty/libyuv/include/libyuv/compare.h +++ /dev/null @@ -1,111 +0,0 @@ -/* - * Copyright 2011 The LibYuv Project Authors. All rights reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#ifndef INCLUDE_LIBYUV_COMPARE_H_ -#define INCLUDE_LIBYUV_COMPARE_H_ - -#include "libyuv/basic_types.h" - -#ifdef __cplusplus -namespace libyuv { -extern "C" { -#endif - -// Compute a hash for specified memory. Seed of 5381 recommended. -LIBYUV_API -uint32_t HashDjb2(const uint8_t* src, uint64_t count, uint32_t seed); - -// Hamming Distance -LIBYUV_API -uint64_t ComputeHammingDistance(const uint8_t* src_a, - const uint8_t* src_b, - int count); - -// Scan an opaque argb image and return fourcc based on alpha offset. -// Returns FOURCC_ARGB, FOURCC_BGRA, or 0 if unknown. -LIBYUV_API -uint32_t ARGBDetect(const uint8_t* argb, - int stride_argb, - int width, - int height); - -// Sum Square Error - used to compute Mean Square Error or PSNR.
-LIBYUV_API -uint64_t ComputeSumSquareError(const uint8_t* src_a, - const uint8_t* src_b, - int count); - -LIBYUV_API -uint64_t ComputeSumSquareErrorPlane(const uint8_t* src_a, - int stride_a, - const uint8_t* src_b, - int stride_b, - int width, - int height); - -static const int kMaxPsnr = 128; - -LIBYUV_API -double SumSquareErrorToPsnr(uint64_t sse, uint64_t count); - -LIBYUV_API -double CalcFramePsnr(const uint8_t* src_a, - int stride_a, - const uint8_t* src_b, - int stride_b, - int width, - int height); - -LIBYUV_API -double I420Psnr(const uint8_t* src_y_a, - int stride_y_a, - const uint8_t* src_u_a, - int stride_u_a, - const uint8_t* src_v_a, - int stride_v_a, - const uint8_t* src_y_b, - int stride_y_b, - const uint8_t* src_u_b, - int stride_u_b, - const uint8_t* src_v_b, - int stride_v_b, - int width, - int height); - -LIBYUV_API -double CalcFrameSsim(const uint8_t* src_a, - int stride_a, - const uint8_t* src_b, - int stride_b, - int width, - int height); - -LIBYUV_API -double I420Ssim(const uint8_t* src_y_a, - int stride_y_a, - const uint8_t* src_u_a, - int stride_u_a, - const uint8_t* src_v_a, - int stride_v_a, - const uint8_t* src_y_b, - int stride_y_b, - const uint8_t* src_u_b, - int stride_u_b, - const uint8_t* src_v_b, - int stride_v_b, - int width, - int height); - -#ifdef __cplusplus -} // extern "C" -} // namespace libyuv -#endif - -#endif // INCLUDE_LIBYUV_COMPARE_H_ diff --git a/thirdparty/libyuv/include/libyuv/compare_row.h b/thirdparty/libyuv/include/libyuv/compare_row.h deleted file mode 100644 index 18c5fa4..0000000 --- a/thirdparty/libyuv/include/libyuv/compare_row.h +++ /dev/null @@ -1,142 +0,0 @@ -/* - * Copyright 2013 The LibYuv Project Authors. All rights reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#ifndef INCLUDE_LIBYUV_COMPARE_ROW_H_ -#define INCLUDE_LIBYUV_COMPARE_ROW_H_ - -#include "libyuv/basic_types.h" - -#ifdef __cplusplus -namespace libyuv { -extern "C" { -#endif - -#if defined(__pnacl__) || defined(__CLR_VER) || \ - (defined(__native_client__) && defined(__x86_64__)) || \ - (defined(__i386__) && !defined(__SSE__) && !defined(__clang__)) -#define LIBYUV_DISABLE_X86 -#endif -#if defined(__native_client__) -#define LIBYUV_DISABLE_NEON -#endif -// MemorySanitizer does not support assembly code yet. http://crbug.com/344505 -#if defined(__has_feature) -#if __has_feature(memory_sanitizer) -#define LIBYUV_DISABLE_X86 -#endif -#endif -// Visual C 2012 required for AVX2. -#if defined(_M_IX86) && !defined(__clang__) && defined(_MSC_VER) && \ - _MSC_VER >= 1700 -#define VISUALC_HAS_AVX2 1 -#endif // VisualStudio >= 2012 - -// clang >= 3.4.0 required for AVX2. 
-#if defined(__clang__) && (defined(__x86_64__) || defined(__i386__)) -#if (__clang_major__ > 3) || (__clang_major__ == 3 && (__clang_minor__ >= 4)) -#define CLANG_HAS_AVX2 1 -#endif // clang >= 3.4 -#endif // __clang__ - -// The following are available for Visual C and GCC: -#if !defined(LIBYUV_DISABLE_X86) && \ - (defined(__x86_64__) || defined(__i386__) || defined(_M_IX86)) -#define HAS_HASHDJB2_SSE41 -#define HAS_SUMSQUAREERROR_SSE2 -#define HAS_HAMMINGDISTANCE_SSE42 -#endif - -// The following are available for Visual C and clangcl 32 bit: -#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && \ - defined(_MSC_VER) && !defined(__clang__) && \ - (defined(VISUALC_HAS_AVX2) || defined(CLANG_HAS_AVX2)) -#define HAS_HASHDJB2_AVX2 -#define HAS_SUMSQUAREERROR_AVX2 -#endif - -// The following are available for GCC and clangcl: -#if !defined(LIBYUV_DISABLE_X86) && \ - (defined(__x86_64__) || defined(__i386__)) -#define HAS_HAMMINGDISTANCE_SSSE3 -#endif - -// The following are available for GCC and clangcl: -#if !defined(LIBYUV_DISABLE_X86) && defined(CLANG_HAS_AVX2) && \ - (defined(__x86_64__) || defined(__i386__)) -#define HAS_HAMMINGDISTANCE_AVX2 -#endif - -// The following are available for Neon: -#if !defined(LIBYUV_DISABLE_NEON) && \ - (defined(__ARM_NEON__) || defined(LIBYUV_NEON) || defined(__aarch64__)) -#define HAS_SUMSQUAREERROR_NEON -#define HAS_HAMMINGDISTANCE_NEON -#endif - -#if !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa) -#define HAS_HAMMINGDISTANCE_MSA -#define HAS_SUMSQUAREERROR_MSA -#endif - -#if !defined(LIBYUV_DISABLE_MMI) && defined(_MIPS_ARCH_LOONGSON3A) -#define HAS_HAMMINGDISTANCE_MMI -#define HAS_SUMSQUAREERROR_MMI -#endif - -uint32_t HammingDistance_C(const uint8_t* src_a, - const uint8_t* src_b, - int count); -uint32_t HammingDistance_SSE42(const uint8_t* src_a, - const uint8_t* src_b, - int count); -uint32_t HammingDistance_SSSE3(const uint8_t* src_a, - const uint8_t* src_b, - int count); -uint32_t HammingDistance_AVX2(const uint8_t* src_a, - const uint8_t* src_b, - int count); -uint32_t HammingDistance_NEON(const uint8_t* src_a, - const uint8_t* src_b, - int count); -uint32_t HammingDistance_MSA(const uint8_t* src_a, - const uint8_t* src_b, - int count); -uint32_t HammingDistance_MMI(const uint8_t* src_a, - const uint8_t* src_b, - int count); -uint32_t SumSquareError_C(const uint8_t* src_a, - const uint8_t* src_b, - int count); -uint32_t SumSquareError_SSE2(const uint8_t* src_a, - const uint8_t* src_b, - int count); -uint32_t SumSquareError_AVX2(const uint8_t* src_a, - const uint8_t* src_b, - int count); -uint32_t SumSquareError_NEON(const uint8_t* src_a, - const uint8_t* src_b, - int count); -uint32_t SumSquareError_MSA(const uint8_t* src_a, - const uint8_t* src_b, - int count); -uint32_t SumSquareError_MMI(const uint8_t* src_a, - const uint8_t* src_b, - int count); - -uint32_t HashDjb2_C(const uint8_t* src, int count, uint32_t seed); -uint32_t HashDjb2_SSE41(const uint8_t* src, int count, uint32_t seed); -uint32_t HashDjb2_AVX2(const uint8_t* src, int count, uint32_t seed); - -#ifdef __cplusplus -} // extern "C" -} // namespace libyuv -#endif - -#endif // INCLUDE_LIBYUV_COMPARE_ROW_H_ diff --git a/thirdparty/libyuv/include/libyuv/convert.h b/thirdparty/libyuv/include/libyuv/convert.h deleted file mode 100644 index 93e7550..0000000 --- a/thirdparty/libyuv/include/libyuv/convert.h +++ /dev/null @@ -1,860 +0,0 @@ -/* - * Copyright 2011 The LibYuv Project Authors. All rights reserved. 
- * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#ifndef INCLUDE_LIBYUV_CONVERT_H_ -#define INCLUDE_LIBYUV_CONVERT_H_ - -#include "libyuv/basic_types.h" - -#include "libyuv/rotate.h" // For enum RotationMode. - -// TODO(fbarchard): fix WebRTC source to include following libyuv headers: -#include "libyuv/convert_argb.h" // For WebRTC I420ToARGB. b/620 -#include "libyuv/convert_from.h" // For WebRTC ConvertFromI420. b/620 -#include "libyuv/planar_functions.h" // For WebRTC I420Rect, CopyPlane. b/618 - -#ifdef __cplusplus -namespace libyuv { -extern "C" { -#endif - -// Convert I444 to I420. -LIBYUV_API -int I444ToI420(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_y, - int dst_stride_y, - uint8_t* dst_u, - int dst_stride_u, - uint8_t* dst_v, - int dst_stride_v, - int width, - int height); - -// Convert I444 to NV12. -LIBYUV_API -int I444ToNV12(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_y, - int dst_stride_y, - uint8_t* dst_uv, - int dst_stride_uv, - int width, - int height); - -// Convert I444 to NV21. -LIBYUV_API -int I444ToNV21(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_y, - int dst_stride_y, - uint8_t* dst_vu, - int dst_stride_vu, - int width, - int height); - -// Convert I422 to I420. -LIBYUV_API -int I422ToI420(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_y, - int dst_stride_y, - uint8_t* dst_u, - int dst_stride_u, - uint8_t* dst_v, - int dst_stride_v, - int width, - int height); - -// Convert I422 to I444. -LIBYUV_API -int I422ToI444(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_y, - int dst_stride_y, - uint8_t* dst_u, - int dst_stride_u, - uint8_t* dst_v, - int dst_stride_v, - int width, - int height); - -// Convert I422 to NV21. -LIBYUV_API -int I422ToNV21(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_y, - int dst_stride_y, - uint8_t* dst_vu, - int dst_stride_vu, - int width, - int height); - -// Copy I420 to I420. -#define I420ToI420 I420Copy -LIBYUV_API -int I420Copy(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_y, - int dst_stride_y, - uint8_t* dst_u, - int dst_stride_u, - uint8_t* dst_v, - int dst_stride_v, - int width, - int height); - -// Convert I420 to I444. 
-LIBYUV_API -int I420ToI444(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_y, - int dst_stride_y, - uint8_t* dst_u, - int dst_stride_u, - uint8_t* dst_v, - int dst_stride_v, - int width, - int height); - -// Copy I010 to I010 -#define I010ToI010 I010Copy -#define H010ToH010 I010Copy -LIBYUV_API -int I010Copy(const uint16_t* src_y, - int src_stride_y, - const uint16_t* src_u, - int src_stride_u, - const uint16_t* src_v, - int src_stride_v, - uint16_t* dst_y, - int dst_stride_y, - uint16_t* dst_u, - int dst_stride_u, - uint16_t* dst_v, - int dst_stride_v, - int width, - int height); - -// Convert 10 bit YUV to 8 bit -#define H010ToH420 I010ToI420 -LIBYUV_API -int I010ToI420(const uint16_t* src_y, - int src_stride_y, - const uint16_t* src_u, - int src_stride_u, - const uint16_t* src_v, - int src_stride_v, - uint8_t* dst_y, - int dst_stride_y, - uint8_t* dst_u, - int dst_stride_u, - uint8_t* dst_v, - int dst_stride_v, - int width, - int height); - -#define H210ToH422 I210ToI422 -LIBYUV_API -int I210ToI422(const uint16_t* src_y, - int src_stride_y, - const uint16_t* src_u, - int src_stride_u, - const uint16_t* src_v, - int src_stride_v, - uint8_t* dst_y, - int dst_stride_y, - uint8_t* dst_u, - int dst_stride_u, - uint8_t* dst_v, - int dst_stride_v, - int width, - int height); - -#define H410ToH444 I410ToI444 -LIBYUV_API -int I410ToI444(const uint16_t* src_y, - int src_stride_y, - const uint16_t* src_u, - int src_stride_u, - const uint16_t* src_v, - int src_stride_v, - uint8_t* dst_y, - int dst_stride_y, - uint8_t* dst_u, - int dst_stride_u, - uint8_t* dst_v, - int dst_stride_v, - int width, - int height); - -#define H012ToH420 I012ToI420 -LIBYUV_API -int I012ToI420(const uint16_t* src_y, - int src_stride_y, - const uint16_t* src_u, - int src_stride_u, - const uint16_t* src_v, - int src_stride_v, - uint8_t* dst_y, - int dst_stride_y, - uint8_t* dst_u, - int dst_stride_u, - uint8_t* dst_v, - int dst_stride_v, - int width, - int height); - -#define H212ToH422 I212ToI422 -LIBYUV_API -int I212ToI422(const uint16_t* src_y, - int src_stride_y, - const uint16_t* src_u, - int src_stride_u, - const uint16_t* src_v, - int src_stride_v, - uint8_t* dst_y, - int dst_stride_y, - uint8_t* dst_u, - int dst_stride_u, - uint8_t* dst_v, - int dst_stride_v, - int width, - int height); - -#define H412ToH444 I412ToI444 -LIBYUV_API -int I412ToI444(const uint16_t* src_y, - int src_stride_y, - const uint16_t* src_u, - int src_stride_u, - const uint16_t* src_v, - int src_stride_v, - uint8_t* dst_y, - int dst_stride_y, - uint8_t* dst_u, - int dst_stride_u, - uint8_t* dst_v, - int dst_stride_v, - int width, - int height); - -#define I412ToI012 I410ToI010 -#define H410ToH010 I410ToI010 -#define H412ToH012 I410ToI010 -LIBYUV_API -int I410ToI010(const uint16_t* src_y, - int src_stride_y, - const uint16_t* src_u, - int src_stride_u, - const uint16_t* src_v, - int src_stride_v, - uint16_t* dst_y, - int dst_stride_y, - uint16_t* dst_u, - int dst_stride_u, - uint16_t* dst_v, - int dst_stride_v, - int width, - int height); - -#define I212ToI012 I210ToI010 -#define H210ToH010 I210ToI010 -#define H212ToH012 I210ToI010 -LIBYUV_API -int I210ToI010(const uint16_t* src_y, - int src_stride_y, - const uint16_t* src_u, - int src_stride_u, - const uint16_t* src_v, - int src_stride_v, - uint16_t* dst_y, - int dst_stride_y, - uint16_t* dst_u, - int dst_stride_u, - uint16_t* dst_v, - int dst_stride_v, - int width, - int height); - -// Convert I010 
to I410 -LIBYUV_API -int I010ToI410(const uint16_t* src_y, - int src_stride_y, - const uint16_t* src_u, - int src_stride_u, - const uint16_t* src_v, - int src_stride_v, - uint16_t* dst_y, - int dst_stride_y, - uint16_t* dst_u, - int dst_stride_u, - uint16_t* dst_v, - int dst_stride_v, - int width, - int height); - -// Convert I012 to I412 -#define I012ToI412 I010ToI410 - -// Convert I210 to I410 -LIBYUV_API -int I210ToI410(const uint16_t* src_y, - int src_stride_y, - const uint16_t* src_u, - int src_stride_u, - const uint16_t* src_v, - int src_stride_v, - uint16_t* dst_y, - int dst_stride_y, - uint16_t* dst_u, - int dst_stride_u, - uint16_t* dst_v, - int dst_stride_v, - int width, - int height); - -// Convert I212 to I412 -#define I212ToI412 I210ToI410 - -// Convert I010 to P010 -LIBYUV_API -int I010ToP010(const uint16_t* src_y, - int src_stride_y, - const uint16_t* src_u, - int src_stride_u, - const uint16_t* src_v, - int src_stride_v, - uint16_t* dst_y, - int dst_stride_y, - uint16_t* dst_uv, - int dst_stride_uv, - int width, - int height); - -// Convert I210 to P210 -LIBYUV_API -int I210ToP210(const uint16_t* src_y, - int src_stride_y, - const uint16_t* src_u, - int src_stride_u, - const uint16_t* src_v, - int src_stride_v, - uint16_t* dst_y, - int dst_stride_y, - uint16_t* dst_uv, - int dst_stride_uv, - int width, - int height); - -// Convert I012 to P012 -LIBYUV_API -int I012ToP012(const uint16_t* src_y, - int src_stride_y, - const uint16_t* src_u, - int src_stride_u, - const uint16_t* src_v, - int src_stride_v, - uint16_t* dst_y, - int dst_stride_y, - uint16_t* dst_uv, - int dst_stride_uv, - int width, - int height); - -// Convert I212 to P212 -LIBYUV_API -int I212ToP212(const uint16_t* src_y, - int src_stride_y, - const uint16_t* src_u, - int src_stride_u, - const uint16_t* src_v, - int src_stride_v, - uint16_t* dst_y, - int dst_stride_y, - uint16_t* dst_uv, - int dst_stride_uv, - int width, - int height); - -// Convert I400 (grey) to I420. -LIBYUV_API -int I400ToI420(const uint8_t* src_y, - int src_stride_y, - uint8_t* dst_y, - int dst_stride_y, - uint8_t* dst_u, - int dst_stride_u, - uint8_t* dst_v, - int dst_stride_v, - int width, - int height); - -// Convert I400 (grey) to NV21. -LIBYUV_API -int I400ToNV21(const uint8_t* src_y, - int src_stride_y, - uint8_t* dst_y, - int dst_stride_y, - uint8_t* dst_vu, - int dst_stride_vu, - int width, - int height); - -#define J400ToJ420 I400ToI420 - -// Convert NV12 to I420. -LIBYUV_API -int NV12ToI420(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_uv, - int src_stride_uv, - uint8_t* dst_y, - int dst_stride_y, - uint8_t* dst_u, - int dst_stride_u, - uint8_t* dst_v, - int dst_stride_v, - int width, - int height); - -// Convert NV21 to I420. -LIBYUV_API -int NV21ToI420(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_vu, - int src_stride_vu, - uint8_t* dst_y, - int dst_stride_y, - uint8_t* dst_u, - int dst_stride_u, - uint8_t* dst_v, - int dst_stride_v, - int width, - int height); - -// Convert NV12 to NV24. -LIBYUV_API -int NV12ToNV24(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_uv, - int src_stride_uv, - uint8_t* dst_y, - int dst_stride_y, - uint8_t* dst_uv, - int dst_stride_uv, - int width, - int height); - -// Convert NV16 to NV24. -LIBYUV_API -int NV16ToNV24(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_uv, - int src_stride_uv, - uint8_t* dst_y, - int dst_stride_y, - uint8_t* dst_uv, - int dst_stride_uv, - int width, - int height); - -// Convert P010 to P410. 
-LIBYUV_API -int P010ToP410(const uint16_t* src_y, - int src_stride_y, - const uint16_t* src_uv, - int src_stride_uv, - uint16_t* dst_y, - int dst_stride_y, - uint16_t* dst_uv, - int dst_stride_uv, - int width, - int height); - -// Convert P012 to P412. -#define P012ToP412 P010ToP410 - -// Convert P016 to P416. -#define P016ToP416 P010ToP410 - -// Convert P210 to P410. -LIBYUV_API -int P210ToP410(const uint16_t* src_y, - int src_stride_y, - const uint16_t* src_uv, - int src_stride_uv, - uint16_t* dst_y, - int dst_stride_y, - uint16_t* dst_uv, - int dst_stride_uv, - int width, - int height); - -// Convert P212 to P412. -#define P212ToP412 P210ToP410 - -// Convert P216 to P416. -#define P216ToP416 P210ToP410 - -// Convert YUY2 to I420. -LIBYUV_API -int YUY2ToI420(const uint8_t* src_yuy2, - int src_stride_yuy2, - uint8_t* dst_y, - int dst_stride_y, - uint8_t* dst_u, - int dst_stride_u, - uint8_t* dst_v, - int dst_stride_v, - int width, - int height); - -// Convert UYVY to I420. -LIBYUV_API -int UYVYToI420(const uint8_t* src_uyvy, - int src_stride_uyvy, - uint8_t* dst_y, - int dst_stride_y, - uint8_t* dst_u, - int dst_stride_u, - uint8_t* dst_v, - int dst_stride_v, - int width, - int height); - -// Convert AYUV to NV12. -LIBYUV_API -int AYUVToNV12(const uint8_t* src_ayuv, - int src_stride_ayuv, - uint8_t* dst_y, - int dst_stride_y, - uint8_t* dst_uv, - int dst_stride_uv, - int width, - int height); - -// Convert AYUV to NV21. -LIBYUV_API -int AYUVToNV21(const uint8_t* src_ayuv, - int src_stride_ayuv, - uint8_t* dst_y, - int dst_stride_y, - uint8_t* dst_vu, - int dst_stride_vu, - int width, - int height); - -// Convert Android420 to I420. -LIBYUV_API -int Android420ToI420(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - int src_pixel_stride_uv, - uint8_t* dst_y, - int dst_stride_y, - uint8_t* dst_u, - int dst_stride_u, - uint8_t* dst_v, - int dst_stride_v, - int width, - int height); - -// ARGB little endian (bgra in memory) to I420. -LIBYUV_API -int ARGBToI420(const uint8_t* src_argb, - int src_stride_argb, - uint8_t* dst_y, - int dst_stride_y, - uint8_t* dst_u, - int dst_stride_u, - uint8_t* dst_v, - int dst_stride_v, - int width, - int height); - -// BGRA little endian (argb in memory) to I420. -LIBYUV_API -int BGRAToI420(const uint8_t* src_bgra, - int src_stride_bgra, - uint8_t* dst_y, - int dst_stride_y, - uint8_t* dst_u, - int dst_stride_u, - uint8_t* dst_v, - int dst_stride_v, - int width, - int height); - -// ABGR little endian (rgba in memory) to I420. -LIBYUV_API -int ABGRToI420(const uint8_t* src_abgr, - int src_stride_abgr, - uint8_t* dst_y, - int dst_stride_y, - uint8_t* dst_u, - int dst_stride_u, - uint8_t* dst_v, - int dst_stride_v, - int width, - int height); - -// RGBA little endian (abgr in memory) to I420. -LIBYUV_API -int RGBAToI420(const uint8_t* src_rgba, - int src_stride_rgba, - uint8_t* dst_y, - int dst_stride_y, - uint8_t* dst_u, - int dst_stride_u, - uint8_t* dst_v, - int dst_stride_v, - int width, - int height); - -// RGB little endian (bgr in memory) to I420. -LIBYUV_API -int RGB24ToI420(const uint8_t* src_rgb24, - int src_stride_rgb24, - uint8_t* dst_y, - int dst_stride_y, - uint8_t* dst_u, - int dst_stride_u, - uint8_t* dst_v, - int dst_stride_v, - int width, - int height); - -// RGB little endian (bgr in memory) to J420. 
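All of the packed-RGB-to-I420 entry points above share the same calling pattern; the sketch below (the helper name and plane layout are illustrative, not part of the header) shows ARGBToI420 writing into a single contiguous I420 buffer:

#include <stddef.h>
#include <stdint.h>
#include "libyuv/convert.h"

/* dst_i420 must hold width*height + 2 * ((width+1)/2) * ((height+1)/2) bytes:
 * a full-resolution Y plane followed by quarter-resolution U and V planes. */
int ArgbFrameToI420(const uint8_t* argb, int width, int height,
                    uint8_t* dst_i420) {
  int y_stride = width;
  int uv_stride = (width + 1) / 2;
  int uv_height = (height + 1) / 2;
  uint8_t* dst_y = dst_i420;
  uint8_t* dst_u = dst_y + (size_t)y_stride * height;
  uint8_t* dst_v = dst_u + (size_t)uv_stride * uv_height;
  return ARGBToI420(argb, width * 4,   /* 4 bytes per ARGB pixel. */
                    dst_y, y_stride,
                    dst_u, uv_stride,
                    dst_v, uv_stride,
                    width, height);    /* Returns 0 on success. */
}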
-LIBYUV_API -int RGB24ToJ420(const uint8_t* src_rgb24, - int src_stride_rgb24, - uint8_t* dst_y, - int dst_stride_y, - uint8_t* dst_u, - int dst_stride_u, - uint8_t* dst_v, - int dst_stride_v, - int width, - int height); - -// RGB big endian (rgb in memory) to I420. -LIBYUV_API -int RAWToI420(const uint8_t* src_raw, - int src_stride_raw, - uint8_t* dst_y, - int dst_stride_y, - uint8_t* dst_u, - int dst_stride_u, - uint8_t* dst_v, - int dst_stride_v, - int width, - int height); - -// RGB big endian (rgb in memory) to J420. -LIBYUV_API -int RAWToJ420(const uint8_t* src_raw, - int src_stride_raw, - uint8_t* dst_y, - int dst_stride_y, - uint8_t* dst_u, - int dst_stride_u, - uint8_t* dst_v, - int dst_stride_v, - int width, - int height); - -// RGB16 (RGBP fourcc) little endian to I420. -LIBYUV_API -int RGB565ToI420(const uint8_t* src_rgb565, - int src_stride_rgb565, - uint8_t* dst_y, - int dst_stride_y, - uint8_t* dst_u, - int dst_stride_u, - uint8_t* dst_v, - int dst_stride_v, - int width, - int height); - -// RGB15 (RGBO fourcc) little endian to I420. -LIBYUV_API -int ARGB1555ToI420(const uint8_t* src_argb1555, - int src_stride_argb1555, - uint8_t* dst_y, - int dst_stride_y, - uint8_t* dst_u, - int dst_stride_u, - uint8_t* dst_v, - int dst_stride_v, - int width, - int height); - -// RGB12 (R444 fourcc) little endian to I420. -LIBYUV_API -int ARGB4444ToI420(const uint8_t* src_argb4444, - int src_stride_argb4444, - uint8_t* dst_y, - int dst_stride_y, - uint8_t* dst_u, - int dst_stride_u, - uint8_t* dst_v, - int dst_stride_v, - int width, - int height); - -// RGB little endian (bgr in memory) to J400. -LIBYUV_API -int RGB24ToJ400(const uint8_t* src_rgb24, - int src_stride_rgb24, - uint8_t* dst_yj, - int dst_stride_yj, - int width, - int height); - -// RGB big endian (rgb in memory) to J400. -LIBYUV_API -int RAWToJ400(const uint8_t* src_raw, - int src_stride_raw, - uint8_t* dst_yj, - int dst_stride_yj, - int width, - int height); - -// src_width/height provided by capture. -// dst_width/height for clipping determine final size. -LIBYUV_API -int MJPGToI420(const uint8_t* sample, - size_t sample_size, - uint8_t* dst_y, - int dst_stride_y, - uint8_t* dst_u, - int dst_stride_u, - uint8_t* dst_v, - int dst_stride_v, - int src_width, - int src_height, - int dst_width, - int dst_height); - -// JPEG to NV21 -LIBYUV_API -int MJPGToNV21(const uint8_t* sample, - size_t sample_size, - uint8_t* dst_y, - int dst_stride_y, - uint8_t* dst_vu, - int dst_stride_vu, - int src_width, - int src_height, - int dst_width, - int dst_height); - -// JPEG to NV12 -LIBYUV_API -int MJPGToNV12(const uint8_t* sample, - size_t sample_size, - uint8_t* dst_y, - int dst_stride_y, - uint8_t* dst_uv, - int dst_stride_uv, - int src_width, - int src_height, - int dst_width, - int dst_height); - -// Query size of MJPG in pixels. -LIBYUV_API -int MJPGSize(const uint8_t* sample, - size_t sample_size, - int* width, - int* height); - -// Convert camera sample to I420 with cropping, rotation and vertical flip. -// "src_size" is needed to parse MJPG. -// "dst_stride_y" number of bytes in a row of the dst_y plane. -// Normally this would be the same as dst_width, with recommended alignment -// to 16 bytes for better efficiency. -// If rotation of 90 or 270 is used, stride is affected. The caller should -// allocate the I420 buffer according to rotation. -// "dst_stride_u" number of bytes in a row of the dst_u plane. -// Normally this would be the same as (dst_width + 1) / 2, with -// recommended alignment to 16 bytes for better efficiency. 
-// If rotation of 90 or 270 is used, stride is affected. -// "crop_x" and "crop_y" are starting position for cropping. -// To center, crop_x = (src_width - dst_width) / 2 -// crop_y = (src_height - dst_height) / 2 -// "src_width" / "src_height" is size of src_frame in pixels. -// "src_height" can be negative indicating a vertically flipped image source. -// "crop_width" / "crop_height" is the size to crop the src to. -// Must be less than or equal to src_width/src_height -// Cropping parameters are pre-rotation. -// "rotation" can be 0, 90, 180 or 270. -// "fourcc" is a fourcc. ie 'I420', 'YUY2' -// Returns 0 for successful; -1 for invalid parameter. Non-zero for failure. -LIBYUV_API -int ConvertToI420(const uint8_t* sample, - size_t sample_size, - uint8_t* dst_y, - int dst_stride_y, - uint8_t* dst_u, - int dst_stride_u, - uint8_t* dst_v, - int dst_stride_v, - int crop_x, - int crop_y, - int src_width, - int src_height, - int crop_width, - int crop_height, - enum RotationMode rotation, - uint32_t fourcc); - -#ifdef __cplusplus -} // extern "C" -} // namespace libyuv -#endif - -#endif // INCLUDE_LIBYUV_CONVERT_H_ diff --git a/thirdparty/libyuv/include/libyuv/convert_argb.h b/thirdparty/libyuv/include/libyuv/convert_argb.h deleted file mode 100644 index eb4ebd5..0000000 --- a/thirdparty/libyuv/include/libyuv/convert_argb.h +++ /dev/null @@ -1,1974 +0,0 @@ -/* - * Copyright 2012 The LibYuv Project Authors. All rights reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#ifndef INCLUDE_LIBYUV_CONVERT_ARGB_H_ -#define INCLUDE_LIBYUV_CONVERT_ARGB_H_ - -#include "libyuv/basic_types.h" - -#include "libyuv/rotate.h" // For enum RotationMode. - -#ifdef __cplusplus -namespace libyuv { -extern "C" { -#endif - -// Conversion matrix for YUV to RGB -LIBYUV_API extern const struct YuvConstants kYuvI601Constants; // BT.601 -LIBYUV_API extern const struct YuvConstants kYuvJPEGConstants; // BT.601 full -LIBYUV_API extern const struct YuvConstants kYuvH709Constants; // BT.709 -LIBYUV_API extern const struct YuvConstants kYuvF709Constants; // BT.709 full -LIBYUV_API extern const struct YuvConstants kYuv2020Constants; // BT.2020 -LIBYUV_API extern const struct YuvConstants kYuvV2020Constants; // BT.2020 full - -// Conversion matrix for YVU to BGR -LIBYUV_API extern const struct YuvConstants kYvuI601Constants; // BT.601 -LIBYUV_API extern const struct YuvConstants kYvuJPEGConstants; // BT.601 full -LIBYUV_API extern const struct YuvConstants kYvuH709Constants; // BT.709 -LIBYUV_API extern const struct YuvConstants kYvuF709Constants; // BT.709 full -LIBYUV_API extern const struct YuvConstants kYvu2020Constants; // BT.2020 -LIBYUV_API extern const struct YuvConstants kYvuV2020Constants; // BT.2020 full - -// Macros for end swapped destination Matrix conversions. -// Swap UV and pass mirrored kYvuJPEGConstants matrix. -// TODO(fbarchard): Add macro for each Matrix function. 
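As a usage sketch for the ConvertToI420 entry point documented above: center-cropping a packed YUY2 capture frame, assuming FOURCC_YUY2 comes from libyuv/video_common.h and kRotate0 from rotate.h (neither is declared in this header), and that the destination planes are already allocated:

#include <stddef.h>
#include <stdint.h>
#include "libyuv/convert.h"
#include "libyuv/video_common.h"  /* Assumed location of FOURCC_YUY2. */

/* Center-crop a YUY2 camera frame to crop_w x crop_h and convert to I420. */
int CenterCropYuy2ToI420(const uint8_t* sample, size_t sample_size,
                         int src_width, int src_height,
                         int crop_w, int crop_h,
                         uint8_t* dst_y, int dst_stride_y,
                         uint8_t* dst_u, int dst_stride_u,
                         uint8_t* dst_v, int dst_stride_v) {
  int crop_x = (src_width - crop_w) / 2;
  int crop_y = (src_height - crop_h) / 2;
  return ConvertToI420(sample, sample_size,
                       dst_y, dst_stride_y,
                       dst_u, dst_stride_u,
                       dst_v, dst_stride_v,
                       crop_x, crop_y,
                       src_width, src_height,
                       crop_w, crop_h,
                       kRotate0,       /* No rotation. */
                       FOURCC_YUY2);   /* Source pixel format. */
}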
-#define kYuvI601ConstantsVU kYvuI601Constants -#define kYuvJPEGConstantsVU kYvuJPEGConstants -#define kYuvH709ConstantsVU kYvuH709Constants -#define kYuvF709ConstantsVU kYvuF709Constants -#define kYuv2020ConstantsVU kYvu2020Constants -#define kYuvV2020ConstantsVU kYvuV2020Constants - -#define NV12ToABGRMatrix(a, b, c, d, e, f, g, h, i) \ - NV21ToARGBMatrix(a, b, c, d, e, f, g##VU, h, i) -#define NV21ToABGRMatrix(a, b, c, d, e, f, g, h, i) \ - NV12ToARGBMatrix(a, b, c, d, e, f, g##VU, h, i) -#define NV12ToRAWMatrix(a, b, c, d, e, f, g, h, i) \ - NV21ToRGB24Matrix(a, b, c, d, e, f, g##VU, h, i) -#define NV21ToRAWMatrix(a, b, c, d, e, f, g, h, i) \ - NV12ToRGB24Matrix(a, b, c, d, e, f, g##VU, h, i) -#define I010ToABGRMatrix(a, b, c, d, e, f, g, h, i, j, k) \ - I010ToARGBMatrix(a, b, e, f, c, d, g, h, i##VU, j, k) -#define I210ToABGRMatrix(a, b, c, d, e, f, g, h, i, j, k) \ - I210ToARGBMatrix(a, b, e, f, c, d, g, h, i##VU, j, k) -#define I410ToABGRMatrix(a, b, c, d, e, f, g, h, i, j, k) \ - I410ToARGBMatrix(a, b, e, f, c, d, g, h, i##VU, j, k) -#define I010ToAB30Matrix(a, b, c, d, e, f, g, h, i, j, k) \ - I010ToAR30Matrix(a, b, e, f, c, d, g, h, i##VU, j, k) -#define I210ToAB30Matrix(a, b, c, d, e, f, g, h, i, j, k) \ - I210ToAR30Matrix(a, b, e, f, c, d, g, h, i##VU, j, k) -#define I410ToAB30Matrix(a, b, c, d, e, f, g, h, i, j, k) \ - I410ToAR30Matrix(a, b, e, f, c, d, g, h, i##VU, j, k) -#define I420AlphaToABGRMatrix(a, b, c, d, e, f, g, h, i, j, k, l, m, n) \ - I420AlphaToARGBMatrix(a, b, e, f, c, d, g, h, i, j, k##VU, l, m, n) -#define I422AlphaToABGRMatrix(a, b, c, d, e, f, g, h, i, j, k, l, m, n) \ - I422AlphaToARGBMatrix(a, b, e, f, c, d, g, h, i, j, k##VU, l, m, n) -#define I444AlphaToABGRMatrix(a, b, c, d, e, f, g, h, i, j, k, l, m, n) \ - I444AlphaToARGBMatrix(a, b, e, f, c, d, g, h, i, j, k##VU, l, m, n) -#define I010AlphaToABGRMatrix(a, b, c, d, e, f, g, h, i, j, k, l, m, n) \ - I010AlphaToARGBMatrix(a, b, e, f, c, d, g, h, i, j, k##VU, l, m, n) -#define I210AlphaToABGRMatrix(a, b, c, d, e, f, g, h, i, j, k, l, m, n) \ - I210AlphaToARGBMatrix(a, b, e, f, c, d, g, h, i, j, k##VU, l, m, n) -#define I410AlphaToABGRMatrix(a, b, c, d, e, f, g, h, i, j, k, l, m, n) \ - I410AlphaToARGBMatrix(a, b, e, f, c, d, g, h, i, j, k##VU, l, m, n) - -// Alias. -#define ARGBToARGB ARGBCopy - -// Copy ARGB to ARGB. -LIBYUV_API -int ARGBCopy(const uint8_t* src_argb, - int src_stride_argb, - uint8_t* dst_argb, - int dst_stride_argb, - int width, - int height); - -// Convert I420 to ARGB. -LIBYUV_API -int I420ToARGB(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_argb, - int dst_stride_argb, - int width, - int height); - -// Convert I420 to ABGR. -LIBYUV_API -int I420ToABGR(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_abgr, - int dst_stride_abgr, - int width, - int height); - -// Convert J420 to ARGB. -LIBYUV_API -int J420ToARGB(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_argb, - int dst_stride_argb, - int width, - int height); - -// Convert J420 to ABGR. -LIBYUV_API -int J420ToABGR(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_abgr, - int dst_stride_abgr, - int width, - int height); - -// Convert H420 to ARGB. 
-LIBYUV_API -int H420ToARGB(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_argb, - int dst_stride_argb, - int width, - int height); - -// Convert H420 to ABGR. -LIBYUV_API -int H420ToABGR(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_abgr, - int dst_stride_abgr, - int width, - int height); - -// Convert U420 to ARGB. -LIBYUV_API -int U420ToARGB(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_argb, - int dst_stride_argb, - int width, - int height); - -// Convert U420 to ABGR. -LIBYUV_API -int U420ToABGR(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_abgr, - int dst_stride_abgr, - int width, - int height); - -// Convert I422 to ARGB. -LIBYUV_API -int I422ToARGB(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_argb, - int dst_stride_argb, - int width, - int height); - -// Convert I422 to ABGR. -LIBYUV_API -int I422ToABGR(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_abgr, - int dst_stride_abgr, - int width, - int height); - -// Convert J422 to ARGB. -LIBYUV_API -int J422ToARGB(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_argb, - int dst_stride_argb, - int width, - int height); - -// Convert J422 to ABGR. -LIBYUV_API -int J422ToABGR(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_abgr, - int dst_stride_abgr, - int width, - int height); - -// Convert H422 to ARGB. -LIBYUV_API -int H422ToARGB(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_argb, - int dst_stride_argb, - int width, - int height); - -// Convert H422 to ABGR. -LIBYUV_API -int H422ToABGR(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_abgr, - int dst_stride_abgr, - int width, - int height); - -// Convert U422 to ARGB. -LIBYUV_API -int U422ToARGB(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_argb, - int dst_stride_argb, - int width, - int height); - -// Convert U422 to ABGR. -LIBYUV_API -int U422ToABGR(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_abgr, - int dst_stride_abgr, - int width, - int height); - -// Convert I444 to ARGB. -LIBYUV_API -int I444ToARGB(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_argb, - int dst_stride_argb, - int width, - int height); - -// Convert I444 to ABGR. 
-LIBYUV_API -int I444ToABGR(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_abgr, - int dst_stride_abgr, - int width, - int height); - -// Convert J444 to ARGB. -LIBYUV_API -int J444ToARGB(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_argb, - int dst_stride_argb, - int width, - int height); - -// Convert J444 to ABGR. -LIBYUV_API -int J444ToABGR(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_abgr, - int dst_stride_abgr, - int width, - int height); - -// Convert H444 to ARGB. -LIBYUV_API -int H444ToARGB(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_argb, - int dst_stride_argb, - int width, - int height); - -// Convert H444 to ABGR. -LIBYUV_API -int H444ToABGR(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_abgr, - int dst_stride_abgr, - int width, - int height); - -// Convert U444 to ARGB. -LIBYUV_API -int U444ToARGB(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_argb, - int dst_stride_argb, - int width, - int height); - -// Convert U444 to ABGR. -LIBYUV_API -int U444ToABGR(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_abgr, - int dst_stride_abgr, - int width, - int height); - -// Convert I010 to ARGB. -LIBYUV_API -int I010ToARGB(const uint16_t* src_y, - int src_stride_y, - const uint16_t* src_u, - int src_stride_u, - const uint16_t* src_v, - int src_stride_v, - uint8_t* dst_argb, - int dst_stride_argb, - int width, - int height); - -// Convert I010 to ABGR. -LIBYUV_API -int I010ToABGR(const uint16_t* src_y, - int src_stride_y, - const uint16_t* src_u, - int src_stride_u, - const uint16_t* src_v, - int src_stride_v, - uint8_t* dst_abgr, - int dst_stride_abgr, - int width, - int height); - -// Convert H010 to ARGB. -LIBYUV_API -int H010ToARGB(const uint16_t* src_y, - int src_stride_y, - const uint16_t* src_u, - int src_stride_u, - const uint16_t* src_v, - int src_stride_v, - uint8_t* dst_argb, - int dst_stride_argb, - int width, - int height); - -// Convert H010 to ABGR. -LIBYUV_API -int H010ToABGR(const uint16_t* src_y, - int src_stride_y, - const uint16_t* src_u, - int src_stride_u, - const uint16_t* src_v, - int src_stride_v, - uint8_t* dst_abgr, - int dst_stride_abgr, - int width, - int height); - -// Convert U010 to ARGB. -LIBYUV_API -int U010ToARGB(const uint16_t* src_y, - int src_stride_y, - const uint16_t* src_u, - int src_stride_u, - const uint16_t* src_v, - int src_stride_v, - uint8_t* dst_argb, - int dst_stride_argb, - int width, - int height); - -// Convert U010 to ABGR. -LIBYUV_API -int U010ToABGR(const uint16_t* src_y, - int src_stride_y, - const uint16_t* src_u, - int src_stride_u, - const uint16_t* src_v, - int src_stride_v, - uint8_t* dst_abgr, - int dst_stride_abgr, - int width, - int height); - -// Convert I210 to ARGB. 
-LIBYUV_API -int I210ToARGB(const uint16_t* src_y, - int src_stride_y, - const uint16_t* src_u, - int src_stride_u, - const uint16_t* src_v, - int src_stride_v, - uint8_t* dst_argb, - int dst_stride_argb, - int width, - int height); - -// Convert I210 to ABGR. -LIBYUV_API -int I210ToABGR(const uint16_t* src_y, - int src_stride_y, - const uint16_t* src_u, - int src_stride_u, - const uint16_t* src_v, - int src_stride_v, - uint8_t* dst_abgr, - int dst_stride_abgr, - int width, - int height); - -// Convert H210 to ARGB. -LIBYUV_API -int H210ToARGB(const uint16_t* src_y, - int src_stride_y, - const uint16_t* src_u, - int src_stride_u, - const uint16_t* src_v, - int src_stride_v, - uint8_t* dst_argb, - int dst_stride_argb, - int width, - int height); - -// Convert H210 to ABGR. -LIBYUV_API -int H210ToABGR(const uint16_t* src_y, - int src_stride_y, - const uint16_t* src_u, - int src_stride_u, - const uint16_t* src_v, - int src_stride_v, - uint8_t* dst_abgr, - int dst_stride_abgr, - int width, - int height); - -// Convert U210 to ARGB. -LIBYUV_API -int U210ToARGB(const uint16_t* src_y, - int src_stride_y, - const uint16_t* src_u, - int src_stride_u, - const uint16_t* src_v, - int src_stride_v, - uint8_t* dst_argb, - int dst_stride_argb, - int width, - int height); - -// Convert U210 to ABGR. -LIBYUV_API -int U210ToABGR(const uint16_t* src_y, - int src_stride_y, - const uint16_t* src_u, - int src_stride_u, - const uint16_t* src_v, - int src_stride_v, - uint8_t* dst_abgr, - int dst_stride_abgr, - int width, - int height); - -// Convert I420 with Alpha to preattenuated ARGB. -LIBYUV_API -int I420AlphaToARGB(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - const uint8_t* src_a, - int src_stride_a, - uint8_t* dst_argb, - int dst_stride_argb, - int width, - int height, - int attenuate); - -// Convert I420 with Alpha to preattenuated ABGR. -LIBYUV_API -int I420AlphaToABGR(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - const uint8_t* src_a, - int src_stride_a, - uint8_t* dst_abgr, - int dst_stride_abgr, - int width, - int height, - int attenuate); - -// Convert I422 with Alpha to preattenuated ARGB. -LIBYUV_API -int I422AlphaToARGB(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - const uint8_t* src_a, - int src_stride_a, - uint8_t* dst_argb, - int dst_stride_argb, - int width, - int height, - int attenuate); - -// Convert I422 with Alpha to preattenuated ABGR. -LIBYUV_API -int I422AlphaToABGR(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - const uint8_t* src_a, - int src_stride_a, - uint8_t* dst_abgr, - int dst_stride_abgr, - int width, - int height, - int attenuate); - -// Convert I444 with Alpha to preattenuated ARGB. -LIBYUV_API -int I444AlphaToARGB(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - const uint8_t* src_a, - int src_stride_a, - uint8_t* dst_argb, - int dst_stride_argb, - int width, - int height, - int attenuate); - -// Convert I444 with Alpha to preattenuated ABGR. 
-LIBYUV_API -int I444AlphaToABGR(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - const uint8_t* src_a, - int src_stride_a, - uint8_t* dst_abgr, - int dst_stride_abgr, - int width, - int height, - int attenuate); - -// Convert I400 (grey) to ARGB. Reverse of ARGBToI400. -LIBYUV_API -int I400ToARGB(const uint8_t* src_y, - int src_stride_y, - uint8_t* dst_argb, - int dst_stride_argb, - int width, - int height); - -// Convert J400 (jpeg grey) to ARGB. -LIBYUV_API -int J400ToARGB(const uint8_t* src_y, - int src_stride_y, - uint8_t* dst_argb, - int dst_stride_argb, - int width, - int height); - -// Alias. -#define YToARGB I400ToARGB - -// Convert NV12 to ARGB. -LIBYUV_API -int NV12ToARGB(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_uv, - int src_stride_uv, - uint8_t* dst_argb, - int dst_stride_argb, - int width, - int height); - -// Convert NV21 to ARGB. -LIBYUV_API -int NV21ToARGB(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_vu, - int src_stride_vu, - uint8_t* dst_argb, - int dst_stride_argb, - int width, - int height); - -// Convert NV12 to ABGR. -LIBYUV_API -int NV12ToABGR(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_uv, - int src_stride_uv, - uint8_t* dst_abgr, - int dst_stride_abgr, - int width, - int height); - -// Convert NV21 to ABGR. -LIBYUV_API -int NV21ToABGR(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_vu, - int src_stride_vu, - uint8_t* dst_abgr, - int dst_stride_abgr, - int width, - int height); - -// Convert NV12 to RGB24. -LIBYUV_API -int NV12ToRGB24(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_uv, - int src_stride_uv, - uint8_t* dst_rgb24, - int dst_stride_rgb24, - int width, - int height); - -// Convert NV21 to RGB24. -LIBYUV_API -int NV21ToRGB24(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_vu, - int src_stride_vu, - uint8_t* dst_rgb24, - int dst_stride_rgb24, - int width, - int height); - -// Convert NV21 to YUV24. -LIBYUV_API -int NV21ToYUV24(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_vu, - int src_stride_vu, - uint8_t* dst_yuv24, - int dst_stride_yuv24, - int width, - int height); - -// Convert NV12 to RAW. -LIBYUV_API -int NV12ToRAW(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_uv, - int src_stride_uv, - uint8_t* dst_raw, - int dst_stride_raw, - int width, - int height); - -// Convert NV21 to RAW. -LIBYUV_API -int NV21ToRAW(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_vu, - int src_stride_vu, - uint8_t* dst_raw, - int dst_stride_raw, - int width, - int height); - -// Convert YUY2 to ARGB. -LIBYUV_API -int YUY2ToARGB(const uint8_t* src_yuy2, - int src_stride_yuy2, - uint8_t* dst_argb, - int dst_stride_argb, - int width, - int height); - -// Convert UYVY to ARGB. -LIBYUV_API -int UYVYToARGB(const uint8_t* src_uyvy, - int src_stride_uyvy, - uint8_t* dst_argb, - int dst_stride_argb, - int width, - int height); - -// Convert I010 to AR30. -LIBYUV_API -int I010ToAR30(const uint16_t* src_y, - int src_stride_y, - const uint16_t* src_u, - int src_stride_u, - const uint16_t* src_v, - int src_stride_v, - uint8_t* dst_ar30, - int dst_stride_ar30, - int width, - int height); - -// Convert H010 to AR30. 
-LIBYUV_API -int H010ToAR30(const uint16_t* src_y, - int src_stride_y, - const uint16_t* src_u, - int src_stride_u, - const uint16_t* src_v, - int src_stride_v, - uint8_t* dst_ar30, - int dst_stride_ar30, - int width, - int height); - -// Convert I010 to AB30. -LIBYUV_API -int I010ToAB30(const uint16_t* src_y, - int src_stride_y, - const uint16_t* src_u, - int src_stride_u, - const uint16_t* src_v, - int src_stride_v, - uint8_t* dst_ab30, - int dst_stride_ab30, - int width, - int height); - -// Convert H010 to AB30. -LIBYUV_API -int H010ToAB30(const uint16_t* src_y, - int src_stride_y, - const uint16_t* src_u, - int src_stride_u, - const uint16_t* src_v, - int src_stride_v, - uint8_t* dst_ab30, - int dst_stride_ab30, - int width, - int height); - -// Convert U010 to AR30. -LIBYUV_API -int U010ToAR30(const uint16_t* src_y, - int src_stride_y, - const uint16_t* src_u, - int src_stride_u, - const uint16_t* src_v, - int src_stride_v, - uint8_t* dst_ar30, - int dst_stride_ar30, - int width, - int height); - -// Convert U010 to AB30. -LIBYUV_API -int U010ToAB30(const uint16_t* src_y, - int src_stride_y, - const uint16_t* src_u, - int src_stride_u, - const uint16_t* src_v, - int src_stride_v, - uint8_t* dst_ab30, - int dst_stride_ab30, - int width, - int height); - -// Convert I210 to AR30. -LIBYUV_API -int I210ToAR30(const uint16_t* src_y, - int src_stride_y, - const uint16_t* src_u, - int src_stride_u, - const uint16_t* src_v, - int src_stride_v, - uint8_t* dst_ar30, - int dst_stride_ar30, - int width, - int height); - -// Convert I210 to AB30. -LIBYUV_API -int I210ToAB30(const uint16_t* src_y, - int src_stride_y, - const uint16_t* src_u, - int src_stride_u, - const uint16_t* src_v, - int src_stride_v, - uint8_t* dst_ab30, - int dst_stride_ab30, - int width, - int height); - -// Convert H210 to AR30. -LIBYUV_API -int H210ToAR30(const uint16_t* src_y, - int src_stride_y, - const uint16_t* src_u, - int src_stride_u, - const uint16_t* src_v, - int src_stride_v, - uint8_t* dst_ar30, - int dst_stride_ar30, - int width, - int height); - -// Convert H210 to AB30. -LIBYUV_API -int H210ToAB30(const uint16_t* src_y, - int src_stride_y, - const uint16_t* src_u, - int src_stride_u, - const uint16_t* src_v, - int src_stride_v, - uint8_t* dst_ab30, - int dst_stride_ab30, - int width, - int height); - -// Convert U210 to AR30. -LIBYUV_API -int U210ToAR30(const uint16_t* src_y, - int src_stride_y, - const uint16_t* src_u, - int src_stride_u, - const uint16_t* src_v, - int src_stride_v, - uint8_t* dst_ar30, - int dst_stride_ar30, - int width, - int height); - -// Convert U210 to AB30. -LIBYUV_API -int U210ToAB30(const uint16_t* src_y, - int src_stride_y, - const uint16_t* src_u, - int src_stride_u, - const uint16_t* src_v, - int src_stride_v, - uint8_t* dst_ab30, - int dst_stride_ab30, - int width, - int height); - -// BGRA little endian (argb in memory) to ARGB. -LIBYUV_API -int BGRAToARGB(const uint8_t* src_bgra, - int src_stride_bgra, - uint8_t* dst_argb, - int dst_stride_argb, - int width, - int height); - -// ABGR little endian (rgba in memory) to ARGB. -LIBYUV_API -int ABGRToARGB(const uint8_t* src_abgr, - int src_stride_abgr, - uint8_t* dst_argb, - int dst_stride_argb, - int width, - int height); - -// RGBA little endian (abgr in memory) to ARGB. -LIBYUV_API -int RGBAToARGB(const uint8_t* src_rgba, - int src_stride_rgba, - uint8_t* dst_argb, - int dst_stride_argb, - int width, - int height); - -// Deprecated function name. -#define BG24ToARGB RGB24ToARGB - -// RGB little endian (bgr in memory) to ARGB. 
-LIBYUV_API -int RGB24ToARGB(const uint8_t* src_rgb24, - int src_stride_rgb24, - uint8_t* dst_argb, - int dst_stride_argb, - int width, - int height); - -// RGB big endian (rgb in memory) to ARGB. -LIBYUV_API -int RAWToARGB(const uint8_t* src_raw, - int src_stride_raw, - uint8_t* dst_argb, - int dst_stride_argb, - int width, - int height); - -// RGB big endian (rgb in memory) to RGBA. -LIBYUV_API -int RAWToRGBA(const uint8_t* src_raw, - int src_stride_raw, - uint8_t* dst_rgba, - int dst_stride_rgba, - int width, - int height); - -// RGB16 (RGBP fourcc) little endian to ARGB. -LIBYUV_API -int RGB565ToARGB(const uint8_t* src_rgb565, - int src_stride_rgb565, - uint8_t* dst_argb, - int dst_stride_argb, - int width, - int height); - -// RGB15 (RGBO fourcc) little endian to ARGB. -LIBYUV_API -int ARGB1555ToARGB(const uint8_t* src_argb1555, - int src_stride_argb1555, - uint8_t* dst_argb, - int dst_stride_argb, - int width, - int height); - -// RGB12 (R444 fourcc) little endian to ARGB. -LIBYUV_API -int ARGB4444ToARGB(const uint8_t* src_argb4444, - int src_stride_argb4444, - uint8_t* dst_argb, - int dst_stride_argb, - int width, - int height); - -// Aliases -#define AB30ToARGB AR30ToABGR -#define AB30ToABGR AR30ToARGB -#define AB30ToAR30 AR30ToAB30 - -// Convert AR30 To ARGB. -LIBYUV_API -int AR30ToARGB(const uint8_t* src_ar30, - int src_stride_ar30, - uint8_t* dst_argb, - int dst_stride_argb, - int width, - int height); - -// Convert AR30 To ABGR. -LIBYUV_API -int AR30ToABGR(const uint8_t* src_ar30, - int src_stride_ar30, - uint8_t* dst_abgr, - int dst_stride_abgr, - int width, - int height); - -// Convert AR30 To AB30. -LIBYUV_API -int AR30ToAB30(const uint8_t* src_ar30, - int src_stride_ar30, - uint8_t* dst_ab30, - int dst_stride_ab30, - int width, - int height); - -// Convert AR64 to ARGB. -LIBYUV_API -int AR64ToARGB(const uint16_t* src_ar64, - int src_stride_ar64, - uint8_t* dst_argb, - int dst_stride_argb, - int width, - int height); - -// Convert AB64 to ABGR. -#define AB64ToABGR AR64ToARGB - -// Convert AB64 to ARGB. -LIBYUV_API -int AB64ToARGB(const uint16_t* src_ab64, - int src_stride_ab64, - uint8_t* dst_argb, - int dst_stride_argb, - int width, - int height); - -// Convert AR64 to ABGR. -#define AR64ToABGR AB64ToARGB - -// Convert AR64 To AB64. -LIBYUV_API -int AR64ToAB64(const uint16_t* src_ar64, - int src_stride_ar64, - uint16_t* dst_ab64, - int dst_stride_ab64, - int width, - int height); - -// Convert AB64 To AR64. -#define AB64ToAR64 AR64ToAB64 - -// src_width/height provided by capture -// dst_width/height for clipping determine final size. -LIBYUV_API -int MJPGToARGB(const uint8_t* sample, - size_t sample_size, - uint8_t* dst_argb, - int dst_stride_argb, - int src_width, - int src_height, - int dst_width, - int dst_height); - -// Convert Android420 to ARGB. -LIBYUV_API -int Android420ToARGB(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - int src_pixel_stride_uv, - uint8_t* dst_argb, - int dst_stride_argb, - int width, - int height); - -// Convert Android420 to ABGR. -LIBYUV_API -int Android420ToABGR(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - int src_pixel_stride_uv, - uint8_t* dst_abgr, - int dst_stride_abgr, - int width, - int height); - -// Convert NV12 to RGB565. 
-LIBYUV_API -int NV12ToRGB565(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_uv, - int src_stride_uv, - uint8_t* dst_rgb565, - int dst_stride_rgb565, - int width, - int height); - -// Convert I422 to BGRA. -LIBYUV_API -int I422ToBGRA(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_bgra, - int dst_stride_bgra, - int width, - int height); - -// Convert I422 to ABGR. -LIBYUV_API -int I422ToABGR(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_abgr, - int dst_stride_abgr, - int width, - int height); - -// Convert I422 to RGBA. -LIBYUV_API -int I422ToRGBA(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_rgba, - int dst_stride_rgba, - int width, - int height); - -LIBYUV_API -int I420ToARGB(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_argb, - int dst_stride_argb, - int width, - int height); - -LIBYUV_API -int I420ToBGRA(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_bgra, - int dst_stride_bgra, - int width, - int height); - -LIBYUV_API -int I420ToABGR(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_abgr, - int dst_stride_abgr, - int width, - int height); - -LIBYUV_API -int I420ToRGBA(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_rgba, - int dst_stride_rgba, - int width, - int height); - -LIBYUV_API -int I420ToRGB24(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_rgb24, - int dst_stride_rgb24, - int width, - int height); - -LIBYUV_API -int I420ToRAW(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_raw, - int dst_stride_raw, - int width, - int height); - -LIBYUV_API -int H420ToRGB24(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_rgb24, - int dst_stride_rgb24, - int width, - int height); - -LIBYUV_API -int H420ToRAW(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_raw, - int dst_stride_raw, - int width, - int height); - -LIBYUV_API -int J420ToRGB24(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_rgb24, - int dst_stride_rgb24, - int width, - int height); - -LIBYUV_API -int J420ToRAW(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_raw, - int dst_stride_raw, - int width, - int height); - -LIBYUV_API -int I420ToRGB565(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_rgb565, - int dst_stride_rgb565, - int width, - int height); - -LIBYUV_API -int 
J420ToRGB565(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_rgb565, - int dst_stride_rgb565, - int width, - int height); - -LIBYUV_API -int H420ToRGB565(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_rgb565, - int dst_stride_rgb565, - int width, - int height); - -LIBYUV_API -int I422ToRGB565(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_rgb565, - int dst_stride_rgb565, - int width, - int height); - -// Convert I420 To RGB565 with 4x4 dither matrix (16 bytes). -// Values in dither matrix from 0 to 7 recommended. -// The order of the dither matrix is first byte is upper left. - -LIBYUV_API -int I420ToRGB565Dither(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_rgb565, - int dst_stride_rgb565, - const uint8_t* dither4x4, - int width, - int height); - -LIBYUV_API -int I420ToARGB1555(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_argb1555, - int dst_stride_argb1555, - int width, - int height); - -LIBYUV_API -int I420ToARGB4444(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_argb4444, - int dst_stride_argb4444, - int width, - int height); - -// Convert I420 to AR30. -LIBYUV_API -int I420ToAR30(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_ar30, - int dst_stride_ar30, - int width, - int height); - -// Convert I420 to AB30. -LIBYUV_API -int I420ToAB30(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_ab30, - int dst_stride_ab30, - int width, - int height); - -// Convert H420 to AR30. -LIBYUV_API -int H420ToAR30(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_ar30, - int dst_stride_ar30, - int width, - int height); - -// Convert H420 to AB30. -LIBYUV_API -int H420ToAB30(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_ab30, - int dst_stride_ab30, - int width, - int height); - -// Convert I420 to ARGB with matrix. -LIBYUV_API -int I420ToARGBMatrix(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_argb, - int dst_stride_argb, - const struct YuvConstants* yuvconstants, - int width, - int height); - -// Convert I422 to ARGB with matrix. -LIBYUV_API -int I422ToARGBMatrix(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_argb, - int dst_stride_argb, - const struct YuvConstants* yuvconstants, - int width, - int height); - -// Convert I444 to ARGB with matrix. 
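A minimal sketch of the I420ToRGB565Dither entry point documented above; the helper name and the 0..7 ordered-dither table are illustrative values, not taken from the header:

#include <stdint.h>
#include "libyuv/convert_argb.h"

/* One possible 4x4 ordered-dither table, values kept in the recommended
 * 0..7 range; the first byte is the upper-left cell. */
static const uint8_t kDither4x4[16] = {
    0, 4, 1, 5,
    6, 2, 7, 3,
    1, 5, 0, 4,
    7, 3, 6, 2,
};

int I420FrameToRgb565Dithered(const uint8_t* y, int y_stride,
                              const uint8_t* u, int u_stride,
                              const uint8_t* v, int v_stride,
                              uint8_t* rgb565, int rgb565_stride,
                              int width, int height) {
  return I420ToRGB565Dither(y, y_stride, u, u_stride, v, v_stride,
                            rgb565, rgb565_stride, kDither4x4,
                            width, height);
}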
-LIBYUV_API -int I444ToARGBMatrix(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_argb, - int dst_stride_argb, - const struct YuvConstants* yuvconstants, - int width, - int height); - -// Convert 10 bit 420 YUV to ARGB with matrix. -LIBYUV_API -int I010ToAR30Matrix(const uint16_t* src_y, - int src_stride_y, - const uint16_t* src_u, - int src_stride_u, - const uint16_t* src_v, - int src_stride_v, - uint8_t* dst_ar30, - int dst_stride_ar30, - const struct YuvConstants* yuvconstants, - int width, - int height); - -// Convert 10 bit 420 YUV to ARGB with matrix. -LIBYUV_API -int I210ToAR30Matrix(const uint16_t* src_y, - int src_stride_y, - const uint16_t* src_u, - int src_stride_u, - const uint16_t* src_v, - int src_stride_v, - uint8_t* dst_ar30, - int dst_stride_ar30, - const struct YuvConstants* yuvconstants, - int width, - int height); - -// Convert 10 bit 444 YUV to ARGB with matrix. -LIBYUV_API -int I410ToAR30Matrix(const uint16_t* src_y, - int src_stride_y, - const uint16_t* src_u, - int src_stride_u, - const uint16_t* src_v, - int src_stride_v, - uint8_t* dst_ar30, - int dst_stride_ar30, - const struct YuvConstants* yuvconstants, - int width, - int height); - -// Convert 10 bit YUV to ARGB with matrix. -LIBYUV_API -int I010ToARGBMatrix(const uint16_t* src_y, - int src_stride_y, - const uint16_t* src_u, - int src_stride_u, - const uint16_t* src_v, - int src_stride_v, - uint8_t* dst_argb, - int dst_stride_argb, - const struct YuvConstants* yuvconstants, - int width, - int height); - -// multiply 12 bit yuv into high bits to allow any number of bits. -LIBYUV_API -int I012ToAR30Matrix(const uint16_t* src_y, - int src_stride_y, - const uint16_t* src_u, - int src_stride_u, - const uint16_t* src_v, - int src_stride_v, - uint8_t* dst_ar30, - int dst_stride_ar30, - const struct YuvConstants* yuvconstants, - int width, - int height); - -// Convert 12 bit YUV to ARGB with matrix. -LIBYUV_API -int I012ToARGBMatrix(const uint16_t* src_y, - int src_stride_y, - const uint16_t* src_u, - int src_stride_u, - const uint16_t* src_v, - int src_stride_v, - uint8_t* dst_argb, - int dst_stride_argb, - const struct YuvConstants* yuvconstants, - int width, - int height); - -// Convert 10 bit 422 YUV to ARGB with matrix. -LIBYUV_API -int I210ToARGBMatrix(const uint16_t* src_y, - int src_stride_y, - const uint16_t* src_u, - int src_stride_u, - const uint16_t* src_v, - int src_stride_v, - uint8_t* dst_argb, - int dst_stride_argb, - const struct YuvConstants* yuvconstants, - int width, - int height); - -// Convert 10 bit 444 YUV to ARGB with matrix. -LIBYUV_API -int I410ToARGBMatrix(const uint16_t* src_y, - int src_stride_y, - const uint16_t* src_u, - int src_stride_u, - const uint16_t* src_v, - int src_stride_v, - uint8_t* dst_argb, - int dst_stride_argb, - const struct YuvConstants* yuvconstants, - int width, - int height); - -// Convert P010 to ARGB with matrix. -LIBYUV_API -int P010ToARGBMatrix(const uint16_t* src_y, - int src_stride_y, - const uint16_t* src_uv, - int src_stride_uv, - uint8_t* dst_argb, - int dst_stride_argb, - const struct YuvConstants* yuvconstants, - int width, - int height); - -// Convert P210 to ARGB with matrix. -LIBYUV_API -int P210ToARGBMatrix(const uint16_t* src_y, - int src_stride_y, - const uint16_t* src_uv, - int src_stride_uv, - uint8_t* dst_argb, - int dst_stride_argb, - const struct YuvConstants* yuvconstants, - int width, - int height); - -// Convert P010 to AR30 with matrix. 
-LIBYUV_API -int P010ToAR30Matrix(const uint16_t* src_y, - int src_stride_y, - const uint16_t* src_uv, - int src_stride_uv, - uint8_t* dst_ar30, - int dst_stride_ar30, - const struct YuvConstants* yuvconstants, - int width, - int height); - -// Convert P210 to AR30 with matrix. -LIBYUV_API -int P210ToAR30Matrix(const uint16_t* src_y, - int src_stride_y, - const uint16_t* src_uv, - int src_stride_uv, - uint8_t* dst_ar30, - int dst_stride_ar30, - const struct YuvConstants* yuvconstants, - int width, - int height); - -// P012 and P010 use most significant bits so the conversion is the same. -// Convert P012 to ARGB with matrix. -#define P012ToARGBMatrix P010ToARGBMatrix -// Convert P012 to AR30 with matrix. -#define P012ToAR30Matrix P010ToAR30Matrix -// Convert P212 to ARGB with matrix. -#define P212ToARGBMatrix P210ToARGBMatrix -// Convert P212 to AR30 with matrix. -#define P212ToAR30Matrix P210ToAR30Matrix - -// Convert P016 to ARGB with matrix. -#define P016ToARGBMatrix P010ToARGBMatrix -// Convert P016 to AR30 with matrix. -#define P016ToAR30Matrix P010ToAR30Matrix -// Convert P216 to ARGB with matrix. -#define P216ToARGBMatrix P210ToARGBMatrix -// Convert P216 to AR30 with matrix. -#define P216ToAR30Matrix P210ToAR30Matrix - -// Convert I420 with Alpha to preattenuated ARGB with matrix. -LIBYUV_API -int I420AlphaToARGBMatrix(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - const uint8_t* src_a, - int src_stride_a, - uint8_t* dst_argb, - int dst_stride_argb, - const struct YuvConstants* yuvconstants, - int width, - int height, - int attenuate); - -// Convert I422 with Alpha to preattenuated ARGB with matrix. -LIBYUV_API -int I422AlphaToARGBMatrix(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - const uint8_t* src_a, - int src_stride_a, - uint8_t* dst_argb, - int dst_stride_argb, - const struct YuvConstants* yuvconstants, - int width, - int height, - int attenuate); - -// Convert I444 with Alpha to preattenuated ARGB with matrix. -LIBYUV_API -int I444AlphaToARGBMatrix(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - const uint8_t* src_a, - int src_stride_a, - uint8_t* dst_argb, - int dst_stride_argb, - const struct YuvConstants* yuvconstants, - int width, - int height, - int attenuate); - -// Convert I010 with Alpha to preattenuated ARGB with matrix. -LIBYUV_API -int I010AlphaToARGBMatrix(const uint16_t* src_y, - int src_stride_y, - const uint16_t* src_u, - int src_stride_u, - const uint16_t* src_v, - int src_stride_v, - const uint16_t* src_a, - int src_stride_a, - uint8_t* dst_argb, - int dst_stride_argb, - const struct YuvConstants* yuvconstants, - int width, - int height, - int attenuate); - -// Convert I210 with Alpha to preattenuated ARGB with matrix. -LIBYUV_API -int I210AlphaToARGBMatrix(const uint16_t* src_y, - int src_stride_y, - const uint16_t* src_u, - int src_stride_u, - const uint16_t* src_v, - int src_stride_v, - const uint16_t* src_a, - int src_stride_a, - uint8_t* dst_argb, - int dst_stride_argb, - const struct YuvConstants* yuvconstants, - int width, - int height, - int attenuate); - -// Convert I410 with Alpha to preattenuated ARGB with matrix. 
-LIBYUV_API -int I410AlphaToARGBMatrix(const uint16_t* src_y, - int src_stride_y, - const uint16_t* src_u, - int src_stride_u, - const uint16_t* src_v, - int src_stride_v, - const uint16_t* src_a, - int src_stride_a, - uint8_t* dst_argb, - int dst_stride_argb, - const struct YuvConstants* yuvconstants, - int width, - int height, - int attenuate); - -// Convert NV12 to ARGB with matrix. -LIBYUV_API -int NV12ToARGBMatrix(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_uv, - int src_stride_uv, - uint8_t* dst_argb, - int dst_stride_argb, - const struct YuvConstants* yuvconstants, - int width, - int height); - -// Convert NV21 to ARGB with matrix. -LIBYUV_API -int NV21ToARGBMatrix(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_vu, - int src_stride_vu, - uint8_t* dst_argb, - int dst_stride_argb, - const struct YuvConstants* yuvconstants, - int width, - int height); - -// Convert NV12 to RGB565 with matrix. -LIBYUV_API -int NV12ToRGB565Matrix(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_uv, - int src_stride_uv, - uint8_t* dst_rgb565, - int dst_stride_rgb565, - const struct YuvConstants* yuvconstants, - int width, - int height); - -// Convert NV12 to RGB24 with matrix. -LIBYUV_API -int NV12ToRGB24Matrix(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_uv, - int src_stride_uv, - uint8_t* dst_rgb24, - int dst_stride_rgb24, - const struct YuvConstants* yuvconstants, - int width, - int height); - -// Convert NV21 to RGB24 with matrix. -LIBYUV_API -int NV21ToRGB24Matrix(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_vu, - int src_stride_vu, - uint8_t* dst_rgb24, - int dst_stride_rgb24, - const struct YuvConstants* yuvconstants, - int width, - int height); - -// Convert Android420 to ARGB with matrix. -LIBYUV_API -int Android420ToARGBMatrix(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - int src_pixel_stride_uv, - uint8_t* dst_argb, - int dst_stride_argb, - const struct YuvConstants* yuvconstants, - int width, - int height); - -// Convert I422 to RGBA with matrix. -LIBYUV_API -int I422ToRGBAMatrix(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_rgba, - int dst_stride_rgba, - const struct YuvConstants* yuvconstants, - int width, - int height); - -// Convert I422 to RGBA with matrix. -LIBYUV_API -int I420ToRGBAMatrix(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_rgba, - int dst_stride_rgba, - const struct YuvConstants* yuvconstants, - int width, - int height); - -// Convert I420 to RGB24 with matrix. -LIBYUV_API -int I420ToRGB24Matrix(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_rgb24, - int dst_stride_rgb24, - const struct YuvConstants* yuvconstants, - int width, - int height); - -// Convert I420 to RGB565 with specified color matrix. -LIBYUV_API -int I420ToRGB565Matrix(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_rgb565, - int dst_stride_rgb565, - const struct YuvConstants* yuvconstants, - int width, - int height); - -// Convert I420 to AR30 with matrix. 
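The Matrix variants above take one of the exported YuvConstants tables explicitly; as a sketch (helper name is illustrative), a BT.709 limited-range I420 frame can be converted to RGB24 by passing kYuvH709Constants:

#include <stdint.h>
#include "libyuv/convert_argb.h"

int I420Bt709ToRgb24(const uint8_t* y, int y_stride,
                     const uint8_t* u, int u_stride,
                     const uint8_t* v, int v_stride,
                     uint8_t* rgb24, int rgb24_stride,
                     int width, int height) {
  return I420ToRGB24Matrix(y, y_stride, u, u_stride, v, v_stride,
                           rgb24, rgb24_stride,
                           &kYuvH709Constants,  /* BT.709 coefficients. */
                           width, height);
}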
-LIBYUV_API -int I420ToAR30Matrix(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_ar30, - int dst_stride_ar30, - const struct YuvConstants* yuvconstants, - int width, - int height); - -// Convert I400 (grey) to ARGB. Reverse of ARGBToI400. -LIBYUV_API -int I400ToARGBMatrix(const uint8_t* src_y, - int src_stride_y, - uint8_t* dst_argb, - int dst_stride_argb, - const struct YuvConstants* yuvconstants, - int width, - int height); - -// Convert camera sample to ARGB with cropping, rotation and vertical flip. -// "sample_size" is needed to parse MJPG. -// "dst_stride_argb" number of bytes in a row of the dst_argb plane. -// Normally this would be the same as dst_width, with recommended alignment -// to 16 bytes for better efficiency. -// If rotation of 90 or 270 is used, stride is affected. The caller should -// allocate the I420 buffer according to rotation. -// "dst_stride_u" number of bytes in a row of the dst_u plane. -// Normally this would be the same as (dst_width + 1) / 2, with -// recommended alignment to 16 bytes for better efficiency. -// If rotation of 90 or 270 is used, stride is affected. -// "crop_x" and "crop_y" are starting position for cropping. -// To center, crop_x = (src_width - dst_width) / 2 -// crop_y = (src_height - dst_height) / 2 -// "src_width" / "src_height" is size of src_frame in pixels. -// "src_height" can be negative indicating a vertically flipped image source. -// "crop_width" / "crop_height" is the size to crop the src to. -// Must be less than or equal to src_width/src_height -// Cropping parameters are pre-rotation. -// "rotation" can be 0, 90, 180 or 270. -// "fourcc" is a fourcc. ie 'I420', 'YUY2' -// Returns 0 for successful; -1 for invalid parameter. Non-zero for failure. -LIBYUV_API -int ConvertToARGB(const uint8_t* sample, - size_t sample_size, - uint8_t* dst_argb, - int dst_stride_argb, - int crop_x, - int crop_y, - int src_width, - int src_height, - int crop_width, - int crop_height, - enum RotationMode rotation, - uint32_t fourcc); - -#ifdef __cplusplus -} // extern "C" -} // namespace libyuv -#endif - -#endif // INCLUDE_LIBYUV_CONVERT_ARGB_H_ diff --git a/thirdparty/libyuv/include/libyuv/convert_from.h b/thirdparty/libyuv/include/libyuv/convert_from.h deleted file mode 100644 index 32f42a6..0000000 --- a/thirdparty/libyuv/include/libyuv/convert_from.h +++ /dev/null @@ -1,203 +0,0 @@ -/* - * Copyright 2011 The LibYuv Project Authors. All rights reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#ifndef INCLUDE_LIBYUV_CONVERT_FROM_H_ -#define INCLUDE_LIBYUV_CONVERT_FROM_H_ - -#include "libyuv/basic_types.h" -#include "libyuv/rotate.h" - -#ifdef __cplusplus -namespace libyuv { -extern "C" { -#endif - -// See Also convert.h for conversions from formats to I420. - -// Convert 8 bit YUV to 10 bit. -#define H420ToH010 I420ToI010 -LIBYUV_API -int I420ToI010(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint16_t* dst_y, - int dst_stride_y, - uint16_t* dst_u, - int dst_stride_u, - uint16_t* dst_v, - int dst_stride_v, - int width, - int height); - -// Convert 8 bit YUV to 12 bit. 
-#define H420ToH012 I420ToI012 -LIBYUV_API -int I420ToI012(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint16_t* dst_y, - int dst_stride_y, - uint16_t* dst_u, - int dst_stride_u, - uint16_t* dst_v, - int dst_stride_v, - int width, - int height); - -LIBYUV_API -int I420ToI422(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_y, - int dst_stride_y, - uint8_t* dst_u, - int dst_stride_u, - uint8_t* dst_v, - int dst_stride_v, - int width, - int height); - -LIBYUV_API -int I420ToI444(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_y, - int dst_stride_y, - uint8_t* dst_u, - int dst_stride_u, - uint8_t* dst_v, - int dst_stride_v, - int width, - int height); - -// Copy to I400. Source can be I420, I422, I444, I400, NV12 or NV21. -LIBYUV_API -int I400Copy(const uint8_t* src_y, - int src_stride_y, - uint8_t* dst_y, - int dst_stride_y, - int width, - int height); - -LIBYUV_API -int I420ToNV12(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_y, - int dst_stride_y, - uint8_t* dst_uv, - int dst_stride_uv, - int width, - int height); - -LIBYUV_API -int I420ToNV21(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_y, - int dst_stride_y, - uint8_t* dst_vu, - int dst_stride_vu, - int width, - int height); - -LIBYUV_API -int I420ToYUY2(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_yuy2, - int dst_stride_yuy2, - int width, - int height); - -LIBYUV_API -int I420ToUYVY(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_uyvy, - int dst_stride_uyvy, - int width, - int height); - -// The following are from convert_argb.h -// DEPRECATED: The prototypes will be removed in future. Use convert_argb.h - -// Convert I420 to ARGB. -LIBYUV_API -int I420ToARGB(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_argb, - int dst_stride_argb, - int width, - int height); - -// Convert I420 to ABGR. -LIBYUV_API -int I420ToABGR(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_abgr, - int dst_stride_abgr, - int width, - int height); - -// Convert I420 to specified format. -// "dst_sample_stride" is bytes in a row for the destination. Pass 0 if the -// buffer has contiguous rows. Can be negative. A multiple of 16 is optimal. 
-LIBYUV_API -int ConvertFromI420(const uint8_t* y, - int y_stride, - const uint8_t* u, - int u_stride, - const uint8_t* v, - int v_stride, - uint8_t* dst_sample, - int dst_sample_stride, - int width, - int height, - uint32_t fourcc); - -#ifdef __cplusplus -} // extern "C" -} // namespace libyuv -#endif - -#endif // INCLUDE_LIBYUV_CONVERT_FROM_H_ diff --git a/thirdparty/libyuv/include/libyuv/convert_from_argb.h b/thirdparty/libyuv/include/libyuv/convert_from_argb.h deleted file mode 100644 index bf48786..0000000 --- a/thirdparty/libyuv/include/libyuv/convert_from_argb.h +++ /dev/null @@ -1,335 +0,0 @@ -/* - * Copyright 2012 The LibYuv Project Authors. All rights reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#ifndef INCLUDE_LIBYUV_CONVERT_FROM_ARGB_H_ -#define INCLUDE_LIBYUV_CONVERT_FROM_ARGB_H_ - -#include "libyuv/basic_types.h" - -#ifdef __cplusplus -namespace libyuv { -extern "C" { -#endif - -// Copy ARGB to ARGB. -#define ARGBToARGB ARGBCopy -LIBYUV_API -int ARGBCopy(const uint8_t* src_argb, - int src_stride_argb, - uint8_t* dst_argb, - int dst_stride_argb, - int width, - int height); - -// Convert ARGB To BGRA. -LIBYUV_API -int ARGBToBGRA(const uint8_t* src_argb, - int src_stride_argb, - uint8_t* dst_bgra, - int dst_stride_bgra, - int width, - int height); - -// Convert ARGB To ABGR. -LIBYUV_API -int ARGBToABGR(const uint8_t* src_argb, - int src_stride_argb, - uint8_t* dst_abgr, - int dst_stride_abgr, - int width, - int height); - -// Convert ARGB To RGBA. -LIBYUV_API -int ARGBToRGBA(const uint8_t* src_argb, - int src_stride_argb, - uint8_t* dst_rgba, - int dst_stride_rgba, - int width, - int height); - -// Aliases -#define ARGBToAB30 ABGRToAR30 -#define ABGRToAB30 ARGBToAR30 - -// Convert ABGR To AR30. -LIBYUV_API -int ABGRToAR30(const uint8_t* src_abgr, - int src_stride_abgr, - uint8_t* dst_ar30, - int dst_stride_ar30, - int width, - int height); - -// Convert ARGB To AR30. -LIBYUV_API -int ARGBToAR30(const uint8_t* src_argb, - int src_stride_argb, - uint8_t* dst_ar30, - int dst_stride_ar30, - int width, - int height); - -// Aliases -#define ABGRToRGB24 ARGBToRAW -#define ABGRToRAW ARGBToRGB24 - -// Convert ARGB To RGB24. -LIBYUV_API -int ARGBToRGB24(const uint8_t* src_argb, - int src_stride_argb, - uint8_t* dst_rgb24, - int dst_stride_rgb24, - int width, - int height); - -// Convert ARGB To RAW. -LIBYUV_API -int ARGBToRAW(const uint8_t* src_argb, - int src_stride_argb, - uint8_t* dst_raw, - int dst_stride_raw, - int width, - int height); - -// Convert ARGB To RGB565. -LIBYUV_API -int ARGBToRGB565(const uint8_t* src_argb, - int src_stride_argb, - uint8_t* dst_rgb565, - int dst_stride_rgb565, - int width, - int height); - -// Convert ARGB To RGB565 with 4x4 dither matrix (16 bytes). -// Values in dither matrix from 0 to 7 recommended. -// The order of the dither matrix is first byte is upper left. -// TODO(fbarchard): Consider pointer to 2d array for dither4x4. -// const uint8_t(*dither)[4][4]; -LIBYUV_API -int ARGBToRGB565Dither(const uint8_t* src_argb, - int src_stride_argb, - uint8_t* dst_rgb565, - int dst_stride_rgb565, - const uint8_t* dither4x4, - int width, - int height); - -// Convert ARGB To ARGB1555. 
-LIBYUV_API -int ARGBToARGB1555(const uint8_t* src_argb, - int src_stride_argb, - uint8_t* dst_argb1555, - int dst_stride_argb1555, - int width, - int height); - -// Convert ARGB To ARGB4444. -LIBYUV_API -int ARGBToARGB4444(const uint8_t* src_argb, - int src_stride_argb, - uint8_t* dst_argb4444, - int dst_stride_argb4444, - int width, - int height); - -// Convert ARGB To I444. -LIBYUV_API -int ARGBToI444(const uint8_t* src_argb, - int src_stride_argb, - uint8_t* dst_y, - int dst_stride_y, - uint8_t* dst_u, - int dst_stride_u, - uint8_t* dst_v, - int dst_stride_v, - int width, - int height); - -// Convert ARGB to AR64. -LIBYUV_API -int ARGBToAR64(const uint8_t* src_argb, - int src_stride_argb, - uint16_t* dst_ar64, - int dst_stride_ar64, - int width, - int height); - -// Convert ABGR to AB64. -#define ABGRToAB64 ARGBToAR64 - -// Convert ARGB to AB64. -LIBYUV_API -int ARGBToAB64(const uint8_t* src_argb, - int src_stride_argb, - uint16_t* dst_ab64, - int dst_stride_ab64, - int width, - int height); - -// Convert ABGR to AR64. -#define ABGRToAR64 ARGBToAB64 - -// Convert ARGB To I422. -LIBYUV_API -int ARGBToI422(const uint8_t* src_argb, - int src_stride_argb, - uint8_t* dst_y, - int dst_stride_y, - uint8_t* dst_u, - int dst_stride_u, - uint8_t* dst_v, - int dst_stride_v, - int width, - int height); - -// Convert ARGB To I420. (also in convert.h) -LIBYUV_API -int ARGBToI420(const uint8_t* src_argb, - int src_stride_argb, - uint8_t* dst_y, - int dst_stride_y, - uint8_t* dst_u, - int dst_stride_u, - uint8_t* dst_v, - int dst_stride_v, - int width, - int height); - -// Convert ARGB to J420. (JPeg full range I420). -LIBYUV_API -int ARGBToJ420(const uint8_t* src_argb, - int src_stride_argb, - uint8_t* dst_yj, - int dst_stride_yj, - uint8_t* dst_u, - int dst_stride_u, - uint8_t* dst_v, - int dst_stride_v, - int width, - int height); - -// Convert ARGB to J422. -LIBYUV_API -int ARGBToJ422(const uint8_t* src_argb, - int src_stride_argb, - uint8_t* dst_yj, - int dst_stride_yj, - uint8_t* dst_u, - int dst_stride_u, - uint8_t* dst_v, - int dst_stride_v, - int width, - int height); - -// Convert ARGB to J400. (JPeg full range). -LIBYUV_API -int ARGBToJ400(const uint8_t* src_argb, - int src_stride_argb, - uint8_t* dst_yj, - int dst_stride_yj, - int width, - int height); - -// Convert RGBA to J400. (JPeg full range). -LIBYUV_API -int RGBAToJ400(const uint8_t* src_rgba, - int src_stride_rgba, - uint8_t* dst_yj, - int dst_stride_yj, - int width, - int height); - -// Convert ARGB to I400. -LIBYUV_API -int ARGBToI400(const uint8_t* src_argb, - int src_stride_argb, - uint8_t* dst_y, - int dst_stride_y, - int width, - int height); - -// Convert ARGB to G. (Reverse of J400toARGB, which replicates G back to ARGB) -LIBYUV_API -int ARGBToG(const uint8_t* src_argb, - int src_stride_argb, - uint8_t* dst_g, - int dst_stride_g, - int width, - int height); - -// Convert ARGB To NV12. -LIBYUV_API -int ARGBToNV12(const uint8_t* src_argb, - int src_stride_argb, - uint8_t* dst_y, - int dst_stride_y, - uint8_t* dst_uv, - int dst_stride_uv, - int width, - int height); - -// Convert ARGB To NV21. -LIBYUV_API -int ARGBToNV21(const uint8_t* src_argb, - int src_stride_argb, - uint8_t* dst_y, - int dst_stride_y, - uint8_t* dst_vu, - int dst_stride_vu, - int width, - int height); - -// Convert ABGR To NV12. -LIBYUV_API -int ABGRToNV12(const uint8_t* src_abgr, - int src_stride_abgr, - uint8_t* dst_y, - int dst_stride_y, - uint8_t* dst_uv, - int dst_stride_uv, - int width, - int height); - -// Convert ABGR To NV21. 
-LIBYUV_API -int ABGRToNV21(const uint8_t* src_abgr, - int src_stride_abgr, - uint8_t* dst_y, - int dst_stride_y, - uint8_t* dst_vu, - int dst_stride_vu, - int width, - int height); - -// Convert ARGB To YUY2. -LIBYUV_API -int ARGBToYUY2(const uint8_t* src_argb, - int src_stride_argb, - uint8_t* dst_yuy2, - int dst_stride_yuy2, - int width, - int height); - -// Convert ARGB To UYVY. -LIBYUV_API -int ARGBToUYVY(const uint8_t* src_argb, - int src_stride_argb, - uint8_t* dst_uyvy, - int dst_stride_uyvy, - int width, - int height); - -#ifdef __cplusplus -} // extern "C" -} // namespace libyuv -#endif - -#endif // INCLUDE_LIBYUV_CONVERT_FROM_ARGB_H_ diff --git a/thirdparty/libyuv/include/libyuv/cpu_id.h b/thirdparty/libyuv/include/libyuv/cpu_id.h deleted file mode 100644 index 3e27cc1..0000000 --- a/thirdparty/libyuv/include/libyuv/cpu_id.h +++ /dev/null @@ -1,122 +0,0 @@ -/* - * Copyright 2011 The LibYuv Project Authors. All rights reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#ifndef INCLUDE_LIBYUV_CPU_ID_H_ -#define INCLUDE_LIBYUV_CPU_ID_H_ - -#include "libyuv/basic_types.h" - -#ifdef __cplusplus -namespace libyuv { -extern "C" { -#endif - -// Internal flag to indicate cpuid requires initialization. -static const int kCpuInitialized = 0x1; - -// These flags are only valid on ARM processors. -static const int kCpuHasARM = 0x2; -static const int kCpuHasNEON = 0x4; -// 0x8 reserved for future ARM flag. - -// These flags are only valid on x86 processors. -static const int kCpuHasX86 = 0x10; -static const int kCpuHasSSE2 = 0x20; -static const int kCpuHasSSSE3 = 0x40; -static const int kCpuHasSSE41 = 0x80; -static const int kCpuHasSSE42 = 0x100; // unused at this time. -static const int kCpuHasAVX = 0x200; -static const int kCpuHasAVX2 = 0x400; -static const int kCpuHasERMS = 0x800; -static const int kCpuHasFMA3 = 0x1000; -static const int kCpuHasF16C = 0x2000; -static const int kCpuHasGFNI = 0x4000; -static const int kCpuHasAVX512BW = 0x8000; -static const int kCpuHasAVX512VL = 0x10000; -static const int kCpuHasAVX512VBMI = 0x20000; -static const int kCpuHasAVX512VBMI2 = 0x40000; -static const int kCpuHasAVX512VBITALG = 0x80000; -static const int kCpuHasAVX512VPOPCNTDQ = 0x100000; - -// These flags are only valid on MIPS processors. -static const int kCpuHasMIPS = 0x200000; -static const int kCpuHasMSA = 0x400000; -static const int kCpuHasMMI = 0x800000; - -// Optional init function. TestCpuFlag does an auto-init. -// Returns cpu_info flags. -LIBYUV_API -int InitCpuFlags(void); - -// Detect CPU has SSE2 etc. -// Test_flag parameter should be one of kCpuHas constants above. -// Returns non-zero if instruction set is detected -static __inline int TestCpuFlag(int test_flag) { - LIBYUV_API extern int cpu_info_; -#ifdef __ATOMIC_RELAXED - int cpu_info = __atomic_load_n(&cpu_info_, __ATOMIC_RELAXED); -#else - int cpu_info = cpu_info_; -#endif - return (!cpu_info ? InitCpuFlags() : cpu_info) & test_flag; -} - -// Internal function for parsing /proc/cpuinfo. -LIBYUV_API -int ArmCpuCaps(const char* cpuinfo_name); -LIBYUV_API -int MipsCpuCaps(const char* cpuinfo_name); - -// For testing, allow CPU flags to be disabled. -// ie MaskCpuFlags(~kCpuHasSSSE3) to disable SSSE3. 
-// MaskCpuFlags(-1) to enable all cpu specific optimizations. -// MaskCpuFlags(1) to disable all cpu specific optimizations. -// MaskCpuFlags(0) to reset state so next call will auto init. -// Returns cpu_info flags. -LIBYUV_API -int MaskCpuFlags(int enable_flags); - -// Sets the CPU flags to |cpu_flags|, bypassing the detection code. |cpu_flags| -// should be a valid combination of the kCpuHas constants above and include -// kCpuInitialized. Use this method when running in a sandboxed process where -// the detection code might fail (as it might access /proc/cpuinfo). In such -// cases the cpu_info can be obtained from a non sandboxed process by calling -// InitCpuFlags() and passed to the sandboxed process (via command line -// parameters, IPC...) which can then call this method to initialize the CPU -// flags. -// Notes: -// - when specifying 0 for |cpu_flags|, the auto initialization is enabled -// again. -// - enabling CPU features that are not supported by the CPU will result in -// undefined behavior. -// TODO(fbarchard): consider writing a helper function that translates from -// other library CPU info to libyuv CPU info and add a .md doc that explains -// CPU detection. -static __inline void SetCpuFlags(int cpu_flags) { - LIBYUV_API extern int cpu_info_; -#ifdef __ATOMIC_RELAXED - __atomic_store_n(&cpu_info_, cpu_flags, __ATOMIC_RELAXED); -#else - cpu_info_ = cpu_flags; -#endif -} - -// Low level cpuid for X86. Returns zeros on other CPUs. -// eax is the info type that you want. -// ecx is typically the cpu number, and should normally be zero. -LIBYUV_API -void CpuId(int info_eax, int info_ecx, int* cpu_info); - -#ifdef __cplusplus -} // extern "C" -} // namespace libyuv -#endif - -#endif // INCLUDE_LIBYUV_CPU_ID_H_ diff --git a/thirdparty/libyuv/include/libyuv/macros_msa.h b/thirdparty/libyuv/include/libyuv/macros_msa.h deleted file mode 100644 index 4e232b6..0000000 --- a/thirdparty/libyuv/include/libyuv/macros_msa.h +++ /dev/null @@ -1,236 +0,0 @@ -/* - * Copyright 2016 The LibYuv Project Authors. All rights reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. 
- */
-
-#ifndef INCLUDE_LIBYUV_MACROS_MSA_H_
-#define INCLUDE_LIBYUV_MACROS_MSA_H_
-
-#if !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa)
-#include <msa.h>
-#include <stdint.h>
-
-#if (__mips_isa_rev >= 6)
-#define LW(psrc) \
- ({ \
- const uint8_t* psrc_lw_m = (const uint8_t*)(psrc); \
- uint32_t val_m; \
- asm volatile("lw %[val_m], %[psrc_lw_m] \n" \
- : [val_m] "=r"(val_m) \
- : [psrc_lw_m] "m"(*psrc_lw_m)); \
- val_m; \
- })
-
-#if (__mips == 64)
-#define LD(psrc) \
- ({ \
- const uint8_t* psrc_ld_m = (const uint8_t*)(psrc); \
- uint64_t val_m = 0; \
- asm volatile("ld %[val_m], %[psrc_ld_m] \n" \
- : [val_m] "=r"(val_m) \
- : [psrc_ld_m] "m"(*psrc_ld_m)); \
- val_m; \
- })
-#else // !(__mips == 64)
-#define LD(psrc) \
- ({ \
- const uint8_t* psrc_ld_m = (const uint8_t*)(psrc); \
- uint32_t val0_m, val1_m; \
- uint64_t val_m = 0; \
- val0_m = LW(psrc_ld_m); \
- val1_m = LW(psrc_ld_m + 4); \
- val_m = (uint64_t)(val1_m); /* NOLINT */ \
- val_m = (uint64_t)((val_m << 32) & 0xFFFFFFFF00000000); /* NOLINT */ \
- val_m = (uint64_t)(val_m | (uint64_t)val0_m); /* NOLINT */ \
- val_m; \
- })
-#endif // (__mips == 64)
-
-#define SW(val, pdst) \
- ({ \
- uint8_t* pdst_sw_m = (uint8_t*)(pdst); /* NOLINT */ \
- uint32_t val_m = (val); \
- asm volatile("sw %[val_m], %[pdst_sw_m] \n" \
- : [pdst_sw_m] "=m"(*pdst_sw_m) \
- : [val_m] "r"(val_m)); \
- })
-
-#if (__mips == 64)
-#define SD(val, pdst) \
- ({ \
- uint8_t* pdst_sd_m = (uint8_t*)(pdst); /* NOLINT */ \
- uint64_t val_m = (val); \
- asm volatile("sd %[val_m], %[pdst_sd_m] \n" \
- : [pdst_sd_m] "=m"(*pdst_sd_m) \
- : [val_m] "r"(val_m)); \
- })
-#else // !(__mips == 64)
-#define SD(val, pdst) \
- ({ \
- uint8_t* pdst_sd_m = (uint8_t*)(pdst); /* NOLINT */ \
- uint32_t val0_m, val1_m; \
- val0_m = (uint32_t)((val)&0x00000000FFFFFFFF); \
- val1_m = (uint32_t)(((val) >> 32) & 0x00000000FFFFFFFF); \
- SW(val0_m, pdst_sd_m); \
- SW(val1_m, pdst_sd_m + 4); \
- })
-#endif // !(__mips == 64)
-#else // !(__mips_isa_rev >= 6)
-#define LW(psrc) \
- ({ \
- const uint8_t* psrc_lw_m = (const uint8_t*)(psrc); \
- uint32_t val_m; \
- asm volatile("ulw %[val_m], %[psrc_lw_m] \n" \
- : [val_m] "=r"(val_m) \
- : [psrc_lw_m] "m"(*psrc_lw_m)); \
- val_m; \
- })
-
-#if (__mips == 64)
-#define LD(psrc) \
- ({ \
- const uint8_t* psrc_ld_m = (const uint8_t*)(psrc); \
- uint64_t val_m = 0; \
- asm volatile("uld %[val_m], %[psrc_ld_m] \n" \
- : [val_m] "=r"(val_m) \
- : [psrc_ld_m] "m"(*psrc_ld_m)); \
- val_m; \
- })
-#else // !(__mips == 64)
-#define LD(psrc) \
- ({ \
- const uint8_t* psrc_ld_m = (const uint8_t*)(psrc); \
- uint32_t val0_m, val1_m; \
- uint64_t val_m = 0; \
- val0_m = LW(psrc_ld_m); \
- val1_m = LW(psrc_ld_m + 4); \
- val_m = (uint64_t)(val1_m); /* NOLINT */ \
- val_m = (uint64_t)((val_m << 32) & 0xFFFFFFFF00000000); /* NOLINT */ \
- val_m = (uint64_t)(val_m | (uint64_t)val0_m); /* NOLINT */ \
- val_m; \
- })
-#endif // (__mips == 64)
-
-#define SW(val, pdst) \
- ({ \
- uint8_t* pdst_sw_m = (uint8_t*)(pdst); /* NOLINT */ \
- uint32_t val_m = (val); \
- asm volatile("usw %[val_m], %[pdst_sw_m] \n" \
- : [pdst_sw_m] "=m"(*pdst_sw_m) \
- : [val_m] "r"(val_m)); \
- })
-
-#define SD(val, pdst) \
- ({ \
- uint8_t* pdst_sd_m = (uint8_t*)(pdst); /* NOLINT */ \
- uint32_t val0_m, val1_m; \
- val0_m = (uint32_t)((val)&0x00000000FFFFFFFF); \
- val1_m = (uint32_t)(((val) >> 32) & 0x00000000FFFFFFFF); \
- SW(val0_m, pdst_sd_m); \
- SW(val1_m, pdst_sd_m + 4); \
- })
-#endif // (__mips_isa_rev >= 6)
-
-// TODO(fbarchard): Consider removing __VAR_ARGS versions.
-#define LD_B(RTYPE, psrc) *((RTYPE*)(psrc)) /* NOLINT */ -#define LD_UB(...) LD_B(const v16u8, __VA_ARGS__) - -#define LD_H(RTYPE, psrc) *((RTYPE*)(psrc)) /* NOLINT */ -#define LD_UH(...) LD_H(const v8u16, __VA_ARGS__) - -#define ST_B(RTYPE, in, pdst) *((RTYPE*)(pdst)) = (in) /* NOLINT */ -#define ST_UB(...) ST_B(v16u8, __VA_ARGS__) - -#define ST_H(RTYPE, in, pdst) *((RTYPE*)(pdst)) = (in) /* NOLINT */ -#define ST_UH(...) ST_H(v8u16, __VA_ARGS__) - -/* Description : Load two vectors with 16 'byte' sized elements - Arguments : Inputs - psrc, stride - Outputs - out0, out1 - Return Type - as per RTYPE - Details : Load 16 byte elements in 'out0' from (psrc) - Load 16 byte elements in 'out1' from (psrc + stride) -*/ -#define LD_B2(RTYPE, psrc, stride, out0, out1) \ - { \ - out0 = LD_B(RTYPE, (psrc)); \ - out1 = LD_B(RTYPE, (psrc) + stride); \ - } -#define LD_UB2(...) LD_B2(const v16u8, __VA_ARGS__) - -#define LD_B4(RTYPE, psrc, stride, out0, out1, out2, out3) \ - { \ - LD_B2(RTYPE, (psrc), stride, out0, out1); \ - LD_B2(RTYPE, (psrc) + 2 * stride, stride, out2, out3); \ - } -#define LD_UB4(...) LD_B4(const v16u8, __VA_ARGS__) - -/* Description : Store two vectors with stride each having 16 'byte' sized - elements - Arguments : Inputs - in0, in1, pdst, stride - Details : Store 16 byte elements from 'in0' to (pdst) - Store 16 byte elements from 'in1' to (pdst + stride) -*/ -#define ST_B2(RTYPE, in0, in1, pdst, stride) \ - { \ - ST_B(RTYPE, in0, (pdst)); \ - ST_B(RTYPE, in1, (pdst) + stride); \ - } -#define ST_UB2(...) ST_B2(v16u8, __VA_ARGS__) - -#define ST_B4(RTYPE, in0, in1, in2, in3, pdst, stride) \ - { \ - ST_B2(RTYPE, in0, in1, (pdst), stride); \ - ST_B2(RTYPE, in2, in3, (pdst) + 2 * stride, stride); \ - } -#define ST_UB4(...) ST_B4(v16u8, __VA_ARGS__) - -/* Description : Store vectors of 8 halfword elements with stride - Arguments : Inputs - in0, in1, pdst, stride - Details : Store 8 halfword elements from 'in0' to (pdst) - Store 8 halfword elements from 'in1' to (pdst + stride) -*/ -#define ST_H2(RTYPE, in0, in1, pdst, stride) \ - { \ - ST_H(RTYPE, in0, (pdst)); \ - ST_H(RTYPE, in1, (pdst) + stride); \ - } -#define ST_UH2(...) ST_H2(v8u16, __VA_ARGS__) - -// TODO(fbarchard): Consider using __msa_vshf_b and __msa_ilvr_b directly. -/* Description : Shuffle byte vector elements as per mask vector - Arguments : Inputs - in0, in1, in2, in3, mask0, mask1 - Outputs - out0, out1 - Return Type - as per RTYPE - Details : Byte elements from 'in0' & 'in1' are copied selectively to - 'out0' as per control vector 'mask0' -*/ -#define VSHF_B2(RTYPE, in0, in1, in2, in3, mask0, mask1, out0, out1) \ - { \ - out0 = (RTYPE)__msa_vshf_b((v16i8)mask0, (v16i8)in1, (v16i8)in0); \ - out1 = (RTYPE)__msa_vshf_b((v16i8)mask1, (v16i8)in3, (v16i8)in2); \ - } -#define VSHF_B2_UB(...) VSHF_B2(v16u8, __VA_ARGS__) - -/* Description : Interleave both left and right half of input vectors - Arguments : Inputs - in0, in1 - Outputs - out0, out1 - Return Type - as per RTYPE - Details : Right half of byte elements from 'in0' and 'in1' are - interleaved and written to 'out0' -*/ -#define ILVRL_B2(RTYPE, in0, in1, out0, out1) \ - { \ - out0 = (RTYPE)__msa_ilvr_b((v16i8)in0, (v16i8)in1); \ - out1 = (RTYPE)__msa_ilvl_b((v16i8)in0, (v16i8)in1); \ - } -#define ILVRL_B2_UB(...) 
ILVRL_B2(v16u8, __VA_ARGS__) - -#endif /* !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa) */ - -#endif // INCLUDE_LIBYUV_MACROS_MSA_H_ diff --git a/thirdparty/libyuv/include/libyuv/mjpeg_decoder.h b/thirdparty/libyuv/include/libyuv/mjpeg_decoder.h deleted file mode 100644 index 275f8d4..0000000 --- a/thirdparty/libyuv/include/libyuv/mjpeg_decoder.h +++ /dev/null @@ -1,195 +0,0 @@ -/* - * Copyright 2012 The LibYuv Project Authors. All rights reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#ifndef INCLUDE_LIBYUV_MJPEG_DECODER_H_ -#define INCLUDE_LIBYUV_MJPEG_DECODER_H_ - -#include "libyuv/basic_types.h" - -#ifdef __cplusplus -// NOTE: For a simplified public API use convert.h MJPGToI420(). - -struct jpeg_common_struct; -struct jpeg_decompress_struct; -struct jpeg_source_mgr; - -namespace libyuv { - -#ifdef __cplusplus -extern "C" { -#endif - -LIBYUV_BOOL ValidateJpeg(const uint8_t* sample, size_t sample_size); - -#ifdef __cplusplus -} // extern "C" -#endif - -static const uint32_t kUnknownDataSize = 0xFFFFFFFF; - -enum JpegSubsamplingType { - kJpegYuv420, - kJpegYuv422, - kJpegYuv444, - kJpegYuv400, - kJpegUnknown -}; - -struct Buffer { - const uint8_t* data; - int len; -}; - -struct BufferVector { - Buffer* buffers; - int len; - int pos; -}; - -struct SetJmpErrorMgr; - -// MJPEG ("Motion JPEG") is a pseudo-standard video codec where the frames are -// simply independent JPEG images with a fixed huffman table (which is omitted). -// It is rarely used in video transmission, but is common as a camera capture -// format, especially in Logitech devices. This class implements a decoder for -// MJPEG frames. -// -// See http://tools.ietf.org/html/rfc2435 -class LIBYUV_API MJpegDecoder { - public: - typedef void (*CallbackFunction)(void* opaque, - const uint8_t* const* data, - const int* strides, - int rows); - - static const int kColorSpaceUnknown; - static const int kColorSpaceGrayscale; - static const int kColorSpaceRgb; - static const int kColorSpaceYCbCr; - static const int kColorSpaceCMYK; - static const int kColorSpaceYCCK; - - MJpegDecoder(); - ~MJpegDecoder(); - - // Loads a new frame, reads its headers, and determines the uncompressed - // image format. - // Returns LIBYUV_TRUE if image looks valid and format is supported. - // If return value is LIBYUV_TRUE, then the values for all the following - // getters are populated. - // src_len is the size of the compressed mjpeg frame in bytes. - LIBYUV_BOOL LoadFrame(const uint8_t* src, size_t src_len); - - // Returns width of the last loaded frame in pixels. - int GetWidth(); - - // Returns height of the last loaded frame in pixels. - int GetHeight(); - - // Returns format of the last loaded frame. The return value is one of the - // kColorSpace* constants. - int GetColorSpace(); - - // Number of color components in the color space. - int GetNumComponents(); - - // Sample factors of the n-th component. - int GetHorizSampFactor(int component); - - int GetVertSampFactor(int component); - - int GetHorizSubSampFactor(int component); - - int GetVertSubSampFactor(int component); - - // Public for testability. - int GetImageScanlinesPerImcuRow(); - - // Public for testability. 
- int GetComponentScanlinesPerImcuRow(int component); - - // Width of a component in bytes. - int GetComponentWidth(int component); - - // Height of a component. - int GetComponentHeight(int component); - - // Width of a component in bytes with padding for DCTSIZE. Public for testing. - int GetComponentStride(int component); - - // Size of a component in bytes. - int GetComponentSize(int component); - - // Call this after LoadFrame() if you decide you don't want to decode it - // after all. - LIBYUV_BOOL UnloadFrame(); - - // Decodes the entire image into a one-buffer-per-color-component format. - // dst_width must match exactly. dst_height must be <= to image height; if - // less, the image is cropped. "planes" must have size equal to at least - // GetNumComponents() and they must point to non-overlapping buffers of size - // at least GetComponentSize(i). The pointers in planes are incremented - // to point to after the end of the written data. - // TODO(fbarchard): Add dst_x, dst_y to allow specific rect to be decoded. - LIBYUV_BOOL DecodeToBuffers(uint8_t** planes, int dst_width, int dst_height); - - // Decodes the entire image and passes the data via repeated calls to a - // callback function. Each call will get the data for a whole number of - // image scanlines. - // TODO(fbarchard): Add dst_x, dst_y to allow specific rect to be decoded. - LIBYUV_BOOL DecodeToCallback(CallbackFunction fn, - void* opaque, - int dst_width, - int dst_height); - - // The helper function which recognizes the jpeg sub-sampling type. - static JpegSubsamplingType JpegSubsamplingTypeHelper( - int* subsample_x, - int* subsample_y, - int number_of_components); - - private: - void AllocOutputBuffers(int num_outbufs); - void DestroyOutputBuffers(); - - LIBYUV_BOOL StartDecode(); - LIBYUV_BOOL FinishDecode(); - - void SetScanlinePointers(uint8_t** data); - LIBYUV_BOOL DecodeImcuRow(); - - int GetComponentScanlinePadding(int component); - - // A buffer holding the input data for a frame. - Buffer buf_; - BufferVector buf_vec_; - - jpeg_decompress_struct* decompress_struct_; - jpeg_source_mgr* source_mgr_; - SetJmpErrorMgr* error_mgr_; - - // LIBYUV_TRUE iff at least one component has scanline padding. (i.e., - // GetComponentScanlinePadding() != 0.) - LIBYUV_BOOL has_scanline_padding_; - - // Temporaries used to point to scanline outputs. - int num_outbufs_; // Outermost size of all arrays below. - uint8_t*** scanlines_; - int* scanlines_sizes_; - // Temporary buffer used for decoding when we can't decode directly to the - // output buffers. Large enough for just one iMCU row. - uint8_t** databuf_; - int* databuf_strides_; -}; - -} // namespace libyuv - -#endif // __cplusplus -#endif // INCLUDE_LIBYUV_MJPEG_DECODER_H_ diff --git a/thirdparty/libyuv/include/libyuv/planar_functions.h b/thirdparty/libyuv/include/libyuv/planar_functions.h deleted file mode 100644 index fdecdee..0000000 --- a/thirdparty/libyuv/include/libyuv/planar_functions.h +++ /dev/null @@ -1,1055 +0,0 @@ -/* - * Copyright 2011 The LibYuv Project Authors. All rights reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. 
- */ - -#ifndef INCLUDE_LIBYUV_PLANAR_FUNCTIONS_H_ -#define INCLUDE_LIBYUV_PLANAR_FUNCTIONS_H_ - -#include "libyuv/basic_types.h" - -// TODO(fbarchard): Remove the following headers includes. -#include "libyuv/convert.h" -#include "libyuv/convert_argb.h" - -#ifdef __cplusplus -namespace libyuv { -extern "C" { -#endif - -// TODO(fbarchard): Move cpu macros to row.h -#if defined(__pnacl__) || defined(__CLR_VER) || \ - (defined(__native_client__) && defined(__x86_64__)) || \ - (defined(__i386__) && !defined(__SSE__) && !defined(__clang__)) -#define LIBYUV_DISABLE_X86 -#endif -// MemorySanitizer does not support assembly code yet. http://crbug.com/344505 -#if defined(__has_feature) -#if __has_feature(memory_sanitizer) -#define LIBYUV_DISABLE_X86 -#endif -#endif -// The following are available on all x86 platforms: -#if !defined(LIBYUV_DISABLE_X86) && \ - (defined(_M_IX86) || defined(__x86_64__) || defined(__i386__)) -#define HAS_ARGBAFFINEROW_SSE2 -#endif - -// Copy a plane of data. -LIBYUV_API -void CopyPlane(const uint8_t* src_y, - int src_stride_y, - uint8_t* dst_y, - int dst_stride_y, - int width, - int height); - -LIBYUV_API -void CopyPlane_16(const uint16_t* src_y, - int src_stride_y, - uint16_t* dst_y, - int dst_stride_y, - int width, - int height); - -LIBYUV_API -void Convert16To8Plane(const uint16_t* src_y, - int src_stride_y, - uint8_t* dst_y, - int dst_stride_y, - int scale, // 16384 for 10 bits - int width, - int height); - -LIBYUV_API -void Convert8To16Plane(const uint8_t* src_y, - int src_stride_y, - uint16_t* dst_y, - int dst_stride_y, - int scale, // 1024 for 10 bits - int width, - int height); - -// Set a plane of data to a 32 bit value. -LIBYUV_API -void SetPlane(uint8_t* dst_y, - int dst_stride_y, - int width, - int height, - uint32_t value); - -// Split interleaved UV plane into separate U and V planes. -LIBYUV_API -void SplitUVPlane(const uint8_t* src_uv, - int src_stride_uv, - uint8_t* dst_u, - int dst_stride_u, - uint8_t* dst_v, - int dst_stride_v, - int width, - int height); - -// Merge separate U and V planes into one interleaved UV plane. -LIBYUV_API -void MergeUVPlane(const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_uv, - int dst_stride_uv, - int width, - int height); - -// Split interleaved msb UV plane into separate lsb U and V planes. -LIBYUV_API -void SplitUVPlane_16(const uint16_t* src_uv, - int src_stride_uv, - uint16_t* dst_u, - int dst_stride_u, - uint16_t* dst_v, - int dst_stride_v, - int width, - int height, - int depth); - -// Merge separate lsb U and V planes into one interleaved msb UV plane. -LIBYUV_API -void MergeUVPlane_16(const uint16_t* src_u, - int src_stride_u, - const uint16_t* src_v, - int src_stride_v, - uint16_t* dst_uv, - int dst_stride_uv, - int width, - int height, - int depth); - -// Convert lsb plane to msb plane -LIBYUV_API -void ConvertToMSBPlane_16(const uint16_t* src_y, - int src_stride_y, - uint16_t* dst_y, - int dst_stride_y, - int width, - int height, - int depth); - -// Convert msb plane to lsb plane -LIBYUV_API -void ConvertToLSBPlane_16(const uint16_t* src_y, - int src_stride_y, - uint16_t* dst_y, - int dst_stride_y, - int width, - int height, - int depth); - -// Scale U and V to half width and height and merge into interleaved UV plane. -// width and height are source size, allowing odd sizes. -// Use for converting I444 or I422 to NV12. 
-LIBYUV_API -void HalfMergeUVPlane(const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_uv, - int dst_stride_uv, - int width, - int height); - -// Swap U and V channels in interleaved UV plane. -LIBYUV_API -void SwapUVPlane(const uint8_t* src_uv, - int src_stride_uv, - uint8_t* dst_vu, - int dst_stride_vu, - int width, - int height); - -// Split interleaved RGB plane into separate R, G and B planes. -LIBYUV_API -void SplitRGBPlane(const uint8_t* src_rgb, - int src_stride_rgb, - uint8_t* dst_r, - int dst_stride_r, - uint8_t* dst_g, - int dst_stride_g, - uint8_t* dst_b, - int dst_stride_b, - int width, - int height); - -// Merge separate R, G and B planes into one interleaved RGB plane. -LIBYUV_API -void MergeRGBPlane(const uint8_t* src_r, - int src_stride_r, - const uint8_t* src_g, - int src_stride_g, - const uint8_t* src_b, - int src_stride_b, - uint8_t* dst_rgb, - int dst_stride_rgb, - int width, - int height); - -// Split interleaved ARGB plane into separate R, G, B and A planes. -// dst_a can be NULL to discard alpha plane. -LIBYUV_API -void SplitARGBPlane(const uint8_t* src_argb, - int src_stride_argb, - uint8_t* dst_r, - int dst_stride_r, - uint8_t* dst_g, - int dst_stride_g, - uint8_t* dst_b, - int dst_stride_b, - uint8_t* dst_a, - int dst_stride_a, - int width, - int height); - -// Merge separate R, G, B and A planes into one interleaved ARGB plane. -// src_a can be NULL to fill opaque value to alpha. -LIBYUV_API -void MergeARGBPlane(const uint8_t* src_r, - int src_stride_r, - const uint8_t* src_g, - int src_stride_g, - const uint8_t* src_b, - int src_stride_b, - const uint8_t* src_a, - int src_stride_a, - uint8_t* dst_argb, - int dst_stride_argb, - int width, - int height); - -// Merge separate 'depth' bit R, G and B planes stored in lsb -// into one interleaved XR30 plane. -// depth should in range [10, 16] -LIBYUV_API -void MergeXR30Plane(const uint16_t* src_r, - int src_stride_r, - const uint16_t* src_g, - int src_stride_g, - const uint16_t* src_b, - int src_stride_b, - uint8_t* dst_ar30, - int dst_stride_ar30, - int width, - int height, - int depth); - -// Merge separate 'depth' bit R, G, B and A planes stored in lsb -// into one interleaved AR64 plane. -// src_a can be NULL to fill opaque value to alpha. -// depth should in range [1, 16] -LIBYUV_API -void MergeAR64Plane(const uint16_t* src_r, - int src_stride_r, - const uint16_t* src_g, - int src_stride_g, - const uint16_t* src_b, - int src_stride_b, - const uint16_t* src_a, - int src_stride_a, - uint16_t* dst_ar64, - int dst_stride_ar64, - int width, - int height, - int depth); - -// Merge separate 'depth' bit R, G, B and A planes stored in lsb -// into one interleaved ARGB plane. -// src_a can be NULL to fill opaque value to alpha. -// depth should in range [8, 16] -LIBYUV_API -void MergeARGB16To8Plane(const uint16_t* src_r, - int src_stride_r, - const uint16_t* src_g, - int src_stride_g, - const uint16_t* src_b, - int src_stride_b, - const uint16_t* src_a, - int src_stride_a, - uint8_t* dst_argb, - int dst_stride_argb, - int width, - int height, - int depth); - -// Copy I400. Supports inverting. -LIBYUV_API -int I400ToI400(const uint8_t* src_y, - int src_stride_y, - uint8_t* dst_y, - int dst_stride_y, - int width, - int height); - -#define J400ToJ400 I400ToI400 - -// Copy I422 to I422. 
-#define I422ToI422 I422Copy -LIBYUV_API -int I422Copy(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_y, - int dst_stride_y, - uint8_t* dst_u, - int dst_stride_u, - uint8_t* dst_v, - int dst_stride_v, - int width, - int height); - -// Copy I444 to I444. -#define I444ToI444 I444Copy -LIBYUV_API -int I444Copy(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_y, - int dst_stride_y, - uint8_t* dst_u, - int dst_stride_u, - uint8_t* dst_v, - int dst_stride_v, - int width, - int height); - -// Copy NV12. Supports inverting. -int NV12Copy(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_uv, - int src_stride_uv, - uint8_t* dst_y, - int dst_stride_y, - uint8_t* dst_uv, - int dst_stride_uv, - int width, - int height); - -// Copy NV21. Supports inverting. -int NV21Copy(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_vu, - int src_stride_vu, - uint8_t* dst_y, - int dst_stride_y, - uint8_t* dst_vu, - int dst_stride_vu, - int width, - int height); - -// Convert YUY2 to I422. -LIBYUV_API -int YUY2ToI422(const uint8_t* src_yuy2, - int src_stride_yuy2, - uint8_t* dst_y, - int dst_stride_y, - uint8_t* dst_u, - int dst_stride_u, - uint8_t* dst_v, - int dst_stride_v, - int width, - int height); - -// Convert UYVY to I422. -LIBYUV_API -int UYVYToI422(const uint8_t* src_uyvy, - int src_stride_uyvy, - uint8_t* dst_y, - int dst_stride_y, - uint8_t* dst_u, - int dst_stride_u, - uint8_t* dst_v, - int dst_stride_v, - int width, - int height); - -LIBYUV_API -int YUY2ToNV12(const uint8_t* src_yuy2, - int src_stride_yuy2, - uint8_t* dst_y, - int dst_stride_y, - uint8_t* dst_uv, - int dst_stride_uv, - int width, - int height); - -LIBYUV_API -int UYVYToNV12(const uint8_t* src_uyvy, - int src_stride_uyvy, - uint8_t* dst_y, - int dst_stride_y, - uint8_t* dst_uv, - int dst_stride_uv, - int width, - int height); - -// Convert NV21 to NV12. -LIBYUV_API -int NV21ToNV12(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_vu, - int src_stride_vu, - uint8_t* dst_y, - int dst_stride_y, - uint8_t* dst_uv, - int dst_stride_uv, - int width, - int height); - -LIBYUV_API -int YUY2ToY(const uint8_t* src_yuy2, - int src_stride_yuy2, - uint8_t* dst_y, - int dst_stride_y, - int width, - int height); - -// Convert I420 to I400. (calls CopyPlane ignoring u/v). -LIBYUV_API -int I420ToI400(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_y, - int dst_stride_y, - int width, - int height); - -// Alias -#define J420ToJ400 I420ToI400 -#define I420ToI420Mirror I420Mirror - -// I420 mirror. -LIBYUV_API -int I420Mirror(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_y, - int dst_stride_y, - uint8_t* dst_u, - int dst_stride_u, - uint8_t* dst_v, - int dst_stride_v, - int width, - int height); - -// Alias -#define I400ToI400Mirror I400Mirror - -// I400 mirror. A single plane is mirrored horizontally. -// Pass negative height to achieve 180 degree rotation. -LIBYUV_API -int I400Mirror(const uint8_t* src_y, - int src_stride_y, - uint8_t* dst_y, - int dst_stride_y, - int width, - int height); - -// Alias -#define NV12ToNV12Mirror NV12Mirror - -// NV12 mirror. 
-LIBYUV_API -int NV12Mirror(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_uv, - int src_stride_uv, - uint8_t* dst_y, - int dst_stride_y, - uint8_t* dst_uv, - int dst_stride_uv, - int width, - int height); - -// Alias -#define ARGBToARGBMirror ARGBMirror - -// ARGB mirror. -LIBYUV_API -int ARGBMirror(const uint8_t* src_argb, - int src_stride_argb, - uint8_t* dst_argb, - int dst_stride_argb, - int width, - int height); - -// Alias -#define RGB24ToRGB24Mirror RGB24Mirror - -// RGB24 mirror. -LIBYUV_API -int RGB24Mirror(const uint8_t* src_rgb24, - int src_stride_rgb24, - uint8_t* dst_rgb24, - int dst_stride_rgb24, - int width, - int height); - -// Mirror a plane of data. -LIBYUV_API -void MirrorPlane(const uint8_t* src_y, - int src_stride_y, - uint8_t* dst_y, - int dst_stride_y, - int width, - int height); - -// Mirror a plane of UV data. -LIBYUV_API -void MirrorUVPlane(const uint8_t* src_uv, - int src_stride_uv, - uint8_t* dst_uv, - int dst_stride_uv, - int width, - int height); - -// Alias -#define RGB24ToRAW RAWToRGB24 - -LIBYUV_API -int RAWToRGB24(const uint8_t* src_raw, - int src_stride_raw, - uint8_t* dst_rgb24, - int dst_stride_rgb24, - int width, - int height); - -// Draw a rectangle into I420. -LIBYUV_API -int I420Rect(uint8_t* dst_y, - int dst_stride_y, - uint8_t* dst_u, - int dst_stride_u, - uint8_t* dst_v, - int dst_stride_v, - int x, - int y, - int width, - int height, - int value_y, - int value_u, - int value_v); - -// Draw a rectangle into ARGB. -LIBYUV_API -int ARGBRect(uint8_t* dst_argb, - int dst_stride_argb, - int dst_x, - int dst_y, - int width, - int height, - uint32_t value); - -// Convert ARGB to gray scale ARGB. -LIBYUV_API -int ARGBGrayTo(const uint8_t* src_argb, - int src_stride_argb, - uint8_t* dst_argb, - int dst_stride_argb, - int width, - int height); - -// Make a rectangle of ARGB gray scale. -LIBYUV_API -int ARGBGray(uint8_t* dst_argb, - int dst_stride_argb, - int dst_x, - int dst_y, - int width, - int height); - -// Make a rectangle of ARGB Sepia tone. -LIBYUV_API -int ARGBSepia(uint8_t* dst_argb, - int dst_stride_argb, - int dst_x, - int dst_y, - int width, - int height); - -// Apply a matrix rotation to each ARGB pixel. -// matrix_argb is 4 signed ARGB values. -128 to 127 representing -2 to 2. -// The first 4 coefficients apply to B, G, R, A and produce B of the output. -// The next 4 coefficients apply to B, G, R, A and produce G of the output. -// The next 4 coefficients apply to B, G, R, A and produce R of the output. -// The last 4 coefficients apply to B, G, R, A and produce A of the output. -LIBYUV_API -int ARGBColorMatrix(const uint8_t* src_argb, - int src_stride_argb, - uint8_t* dst_argb, - int dst_stride_argb, - const int8_t* matrix_argb, - int width, - int height); - -// Deprecated. Use ARGBColorMatrix instead. -// Apply a matrix rotation to each ARGB pixel. -// matrix_argb is 3 signed ARGB values. -128 to 127 representing -1 to 1. -// The first 4 coefficients apply to B, G, R, A and produce B of the output. -// The next 4 coefficients apply to B, G, R, A and produce G of the output. -// The last 4 coefficients apply to B, G, R, A and produce R of the output. -LIBYUV_API -int RGBColorMatrix(uint8_t* dst_argb, - int dst_stride_argb, - const int8_t* matrix_rgb, - int dst_x, - int dst_y, - int width, - int height); - -// Apply a color table each ARGB pixel. -// Table contains 256 ARGB values. 
-LIBYUV_API
-int ARGBColorTable(uint8_t* dst_argb,
- int dst_stride_argb,
- const uint8_t* table_argb,
- int dst_x,
- int dst_y,
- int width,
- int height);
-
-// Apply a color table to each ARGB pixel but preserve destination alpha.
-// Table contains 256 ARGB values.
-LIBYUV_API
-int RGBColorTable(uint8_t* dst_argb,
- int dst_stride_argb,
- const uint8_t* table_argb,
- int dst_x,
- int dst_y,
- int width,
- int height);
-
-// Apply a luma/color table to each ARGB pixel but preserve destination alpha.
-// Table contains 32768 values indexed by [Y][C] where Y is 7 bit luma from
-// RGB (YJ style) and C is an 8 bit color component (R, G or B).
-LIBYUV_API
-int ARGBLumaColorTable(const uint8_t* src_argb,
- int src_stride_argb,
- uint8_t* dst_argb,
- int dst_stride_argb,
- const uint8_t* luma,
- int width,
- int height);
-
-// Apply a 3 term polynomial to ARGB values.
-// poly points to a 4x4 matrix. The first row is constants. The 2nd row is
-// coefficients for b, g, r and a. The 3rd row is coefficients for b squared,
-// g squared, r squared and a squared. The 4th row is coefficients for b to
-// the 3, g to the 3, r to the 3 and a to the 3. The values are summed and
-// result clamped to 0 to 255.
-// A polynomial approximation can be derived using software such as 'R'.
-
-LIBYUV_API
-int ARGBPolynomial(const uint8_t* src_argb,
- int src_stride_argb,
- uint8_t* dst_argb,
- int dst_stride_argb,
- const float* poly,
- int width,
- int height);
-
-// Convert plane of 16 bit shorts to half floats.
-// Source values are multiplied by scale before storing as half float.
-LIBYUV_API
-int HalfFloatPlane(const uint16_t* src_y,
- int src_stride_y,
- uint16_t* dst_y,
- int dst_stride_y,
- float scale,
- int width,
- int height);
-
-// Convert a buffer of bytes to floats, scale the values and store as floats.
-LIBYUV_API
-int ByteToFloat(const uint8_t* src_y, float* dst_y, float scale, int width);
-
-// Quantize a rectangle of ARGB. Alpha unaffected.
-// scale is a 16 bit fractional fixed point scaler between 0 and 65535.
-// interval_size should be a value between 1 and 255.
-// interval_offset should be a value between 0 and 255.
-LIBYUV_API
-int ARGBQuantize(uint8_t* dst_argb,
- int dst_stride_argb,
- int scale,
- int interval_size,
- int interval_offset,
- int dst_x,
- int dst_y,
- int width,
- int height);
-
-// Copy ARGB to ARGB.
-LIBYUV_API
-int ARGBCopy(const uint8_t* src_argb,
- int src_stride_argb,
- uint8_t* dst_argb,
- int dst_stride_argb,
- int width,
- int height);
-
-// Copy Alpha channel of ARGB to alpha of ARGB.
-LIBYUV_API
-int ARGBCopyAlpha(const uint8_t* src_argb,
- int src_stride_argb,
- uint8_t* dst_argb,
- int dst_stride_argb,
- int width,
- int height);
-
-// Extract the alpha channel from ARGB.
-LIBYUV_API
-int ARGBExtractAlpha(const uint8_t* src_argb,
- int src_stride_argb,
- uint8_t* dst_a,
- int dst_stride_a,
- int width,
- int height);
-
-// Copy Y channel to Alpha of ARGB.
-LIBYUV_API
-int ARGBCopyYToAlpha(const uint8_t* src_y,
- int src_stride_y,
- uint8_t* dst_argb,
- int dst_stride_argb,
- int width,
- int height);
-
-typedef void (*ARGBBlendRow)(const uint8_t* src_argb0,
- const uint8_t* src_argb1,
- uint8_t* dst_argb,
- int width);
-
-// Get function to Alpha Blend ARGB pixels and store to destination.
-LIBYUV_API
-ARGBBlendRow GetARGBBlend();
-
-// Alpha Blend ARGB images and store to destination.
-// Source is pre-multiplied by alpha using ARGBAttenuate.
-// Alpha of destination is set to 255.
-LIBYUV_API -int ARGBBlend(const uint8_t* src_argb0, - int src_stride_argb0, - const uint8_t* src_argb1, - int src_stride_argb1, - uint8_t* dst_argb, - int dst_stride_argb, - int width, - int height); - -// Alpha Blend plane and store to destination. -// Source is not pre-multiplied by alpha. -LIBYUV_API -int BlendPlane(const uint8_t* src_y0, - int src_stride_y0, - const uint8_t* src_y1, - int src_stride_y1, - const uint8_t* alpha, - int alpha_stride, - uint8_t* dst_y, - int dst_stride_y, - int width, - int height); - -// Alpha Blend YUV images and store to destination. -// Source is not pre-multiplied by alpha. -// Alpha is full width x height and subsampled to half size to apply to UV. -LIBYUV_API -int I420Blend(const uint8_t* src_y0, - int src_stride_y0, - const uint8_t* src_u0, - int src_stride_u0, - const uint8_t* src_v0, - int src_stride_v0, - const uint8_t* src_y1, - int src_stride_y1, - const uint8_t* src_u1, - int src_stride_u1, - const uint8_t* src_v1, - int src_stride_v1, - const uint8_t* alpha, - int alpha_stride, - uint8_t* dst_y, - int dst_stride_y, - uint8_t* dst_u, - int dst_stride_u, - uint8_t* dst_v, - int dst_stride_v, - int width, - int height); - -// Multiply ARGB image by ARGB image. Shifted down by 8. Saturates to 255. -LIBYUV_API -int ARGBMultiply(const uint8_t* src_argb0, - int src_stride_argb0, - const uint8_t* src_argb1, - int src_stride_argb1, - uint8_t* dst_argb, - int dst_stride_argb, - int width, - int height); - -// Add ARGB image with ARGB image. Saturates to 255. -LIBYUV_API -int ARGBAdd(const uint8_t* src_argb0, - int src_stride_argb0, - const uint8_t* src_argb1, - int src_stride_argb1, - uint8_t* dst_argb, - int dst_stride_argb, - int width, - int height); - -// Subtract ARGB image (argb1) from ARGB image (argb0). Saturates to 0. -LIBYUV_API -int ARGBSubtract(const uint8_t* src_argb0, - int src_stride_argb0, - const uint8_t* src_argb1, - int src_stride_argb1, - uint8_t* dst_argb, - int dst_stride_argb, - int width, - int height); - -// Convert I422 to YUY2. -LIBYUV_API -int I422ToYUY2(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_yuy2, - int dst_stride_yuy2, - int width, - int height); - -// Convert I422 to UYVY. -LIBYUV_API -int I422ToUYVY(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_uyvy, - int dst_stride_uyvy, - int width, - int height); - -// Convert unattentuated ARGB to preattenuated ARGB. -LIBYUV_API -int ARGBAttenuate(const uint8_t* src_argb, - int src_stride_argb, - uint8_t* dst_argb, - int dst_stride_argb, - int width, - int height); - -// Convert preattentuated ARGB to unattenuated ARGB. -LIBYUV_API -int ARGBUnattenuate(const uint8_t* src_argb, - int src_stride_argb, - uint8_t* dst_argb, - int dst_stride_argb, - int width, - int height); - -// Internal function - do not call directly. -// Computes table of cumulative sum for image where the value is the sum -// of all values above and to the left of the entry. Used by ARGBBlur. -LIBYUV_API -int ARGBComputeCumulativeSum(const uint8_t* src_argb, - int src_stride_argb, - int32_t* dst_cumsum, - int dst_stride32_cumsum, - int width, - int height); - -// Blur ARGB image. -// dst_cumsum table of width * (height + 1) * 16 bytes aligned to -// 16 byte boundary. -// dst_stride32_cumsum is number of ints in a row (width * 4). -// radius is number of pixels around the center. e.g. 1 = 3x3. 2=5x5. 
-// Blur is optimized for radius of 5 (11x11) or less. -LIBYUV_API -int ARGBBlur(const uint8_t* src_argb, - int src_stride_argb, - uint8_t* dst_argb, - int dst_stride_argb, - int32_t* dst_cumsum, - int dst_stride32_cumsum, - int width, - int height, - int radius); - -// Gaussian 5x5 blur a float plane. -// Coefficients of 1, 4, 6, 4, 1. -// Each destination pixel is a blur of the 5x5 -// pixels from the source. -// Source edges are clamped. -LIBYUV_API -int GaussPlane_F32(const float* src, - int src_stride, - float* dst, - int dst_stride, - int width, - int height); - -// Multiply ARGB image by ARGB value. -LIBYUV_API -int ARGBShade(const uint8_t* src_argb, - int src_stride_argb, - uint8_t* dst_argb, - int dst_stride_argb, - int width, - int height, - uint32_t value); - -// Interpolate between two images using specified amount of interpolation -// (0 to 255) and store to destination. -// 'interpolation' is specified as 8 bit fraction where 0 means 100% src0 -// and 255 means 1% src0 and 99% src1. -LIBYUV_API -int InterpolatePlane(const uint8_t* src0, - int src_stride0, - const uint8_t* src1, - int src_stride1, - uint8_t* dst, - int dst_stride, - int width, - int height, - int interpolation); - -// Interpolate between two ARGB images using specified amount of interpolation -// Internally calls InterpolatePlane with width * 4 (bpp). -LIBYUV_API -int ARGBInterpolate(const uint8_t* src_argb0, - int src_stride_argb0, - const uint8_t* src_argb1, - int src_stride_argb1, - uint8_t* dst_argb, - int dst_stride_argb, - int width, - int height, - int interpolation); - -// Interpolate between two YUV images using specified amount of interpolation -// Internally calls InterpolatePlane on each plane where the U and V planes -// are half width and half height. -LIBYUV_API -int I420Interpolate(const uint8_t* src0_y, - int src0_stride_y, - const uint8_t* src0_u, - int src0_stride_u, - const uint8_t* src0_v, - int src0_stride_v, - const uint8_t* src1_y, - int src1_stride_y, - const uint8_t* src1_u, - int src1_stride_u, - const uint8_t* src1_v, - int src1_stride_v, - uint8_t* dst_y, - int dst_stride_y, - uint8_t* dst_u, - int dst_stride_u, - uint8_t* dst_v, - int dst_stride_v, - int width, - int height, - int interpolation); - -// Row function for copying pixels from a source with a slope to a row -// of destination. Useful for scaling, rotation, mirror, texture mapping. -LIBYUV_API -void ARGBAffineRow_C(const uint8_t* src_argb, - int src_argb_stride, - uint8_t* dst_argb, - const float* uv_dudv, - int width); -// TODO(fbarchard): Move ARGBAffineRow_SSE2 to row.h -LIBYUV_API -void ARGBAffineRow_SSE2(const uint8_t* src_argb, - int src_argb_stride, - uint8_t* dst_argb, - const float* uv_dudv, - int width); - -// Shuffle ARGB channel order. e.g. BGRA to ARGB. -// shuffler is 16 bytes. -LIBYUV_API -int ARGBShuffle(const uint8_t* src_bgra, - int src_stride_bgra, - uint8_t* dst_argb, - int dst_stride_argb, - const uint8_t* shuffler, - int width, - int height); - -// Shuffle AR64 channel order. e.g. AR64 to AB64. -// shuffler is 16 bytes. -LIBYUV_API -int AR64Shuffle(const uint16_t* src_ar64, - int src_stride_ar64, - uint16_t* dst_ar64, - int dst_stride_ar64, - const uint8_t* shuffler, - int width, - int height); - -// Sobel ARGB effect with planar output. -LIBYUV_API -int ARGBSobelToPlane(const uint8_t* src_argb, - int src_stride_argb, - uint8_t* dst_y, - int dst_stride_y, - int width, - int height); - -// Sobel ARGB effect. 
-LIBYUV_API -int ARGBSobel(const uint8_t* src_argb, - int src_stride_argb, - uint8_t* dst_argb, - int dst_stride_argb, - int width, - int height); - -// Sobel ARGB effect w/ Sobel X, Sobel, Sobel Y in ARGB. -LIBYUV_API -int ARGBSobelXY(const uint8_t* src_argb, - int src_stride_argb, - uint8_t* dst_argb, - int dst_stride_argb, - int width, - int height); - -#ifdef __cplusplus -} // extern "C" -} // namespace libyuv -#endif - -#endif // INCLUDE_LIBYUV_PLANAR_FUNCTIONS_H_ diff --git a/thirdparty/libyuv/include/libyuv/rotate.h b/thirdparty/libyuv/include/libyuv/rotate.h deleted file mode 100644 index 3088822..0000000 --- a/thirdparty/libyuv/include/libyuv/rotate.h +++ /dev/null @@ -1,182 +0,0 @@ -/* - * Copyright 2011 The LibYuv Project Authors. All rights reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#ifndef INCLUDE_LIBYUV_ROTATE_H_ -#define INCLUDE_LIBYUV_ROTATE_H_ - -#include "libyuv/basic_types.h" - -#ifdef __cplusplus -namespace libyuv { -extern "C" { -#endif - -// Supported rotation. -typedef enum RotationMode { - kRotate0 = 0, // No rotation. - kRotate90 = 90, // Rotate 90 degrees clockwise. - kRotate180 = 180, // Rotate 180 degrees. - kRotate270 = 270, // Rotate 270 degrees clockwise. - - // Deprecated. - kRotateNone = 0, - kRotateClockwise = 90, - kRotateCounterClockwise = 270, -} RotationModeEnum; - -// Rotate I420 frame. -LIBYUV_API -int I420Rotate(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_y, - int dst_stride_y, - uint8_t* dst_u, - int dst_stride_u, - uint8_t* dst_v, - int dst_stride_v, - int width, - int height, - enum RotationMode mode); - -// Rotate I444 frame. -LIBYUV_API -int I444Rotate(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_y, - int dst_stride_y, - uint8_t* dst_u, - int dst_stride_u, - uint8_t* dst_v, - int dst_stride_v, - int width, - int height, - enum RotationMode mode); - -// Rotate NV12 input and store in I420. -LIBYUV_API -int NV12ToI420Rotate(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_uv, - int src_stride_uv, - uint8_t* dst_y, - int dst_stride_y, - uint8_t* dst_u, - int dst_stride_u, - uint8_t* dst_v, - int dst_stride_v, - int width, - int height, - enum RotationMode mode); - -// Rotate a plane by 0, 90, 180, or 270. -LIBYUV_API -int RotatePlane(const uint8_t* src, - int src_stride, - uint8_t* dst, - int dst_stride, - int width, - int height, - enum RotationMode mode); - -// Rotate planes by 90, 180, 270. Deprecated. -LIBYUV_API -void RotatePlane90(const uint8_t* src, - int src_stride, - uint8_t* dst, - int dst_stride, - int width, - int height); - -LIBYUV_API -void RotatePlane180(const uint8_t* src, - int src_stride, - uint8_t* dst, - int dst_stride, - int width, - int height); - -LIBYUV_API -void RotatePlane270(const uint8_t* src, - int src_stride, - uint8_t* dst, - int dst_stride, - int width, - int height); - -// Rotations for when U and V are interleaved. -// These functions take one input pointer and -// split the data into two buffers while -// rotating them. Deprecated. 
-LIBYUV_API -void RotateUV90(const uint8_t* src, - int src_stride, - uint8_t* dst_a, - int dst_stride_a, - uint8_t* dst_b, - int dst_stride_b, - int width, - int height); - -LIBYUV_API -void RotateUV180(const uint8_t* src, - int src_stride, - uint8_t* dst_a, - int dst_stride_a, - uint8_t* dst_b, - int dst_stride_b, - int width, - int height); - -LIBYUV_API -void RotateUV270(const uint8_t* src, - int src_stride, - uint8_t* dst_a, - int dst_stride_a, - uint8_t* dst_b, - int dst_stride_b, - int width, - int height); - -// The 90 and 270 functions are based on transposes. -// Doing a transpose with reversing the read/write -// order will result in a rotation by +- 90 degrees. -// Deprecated. -LIBYUV_API -void TransposePlane(const uint8_t* src, - int src_stride, - uint8_t* dst, - int dst_stride, - int width, - int height); - -LIBYUV_API -void TransposeUV(const uint8_t* src, - int src_stride, - uint8_t* dst_a, - int dst_stride_a, - uint8_t* dst_b, - int dst_stride_b, - int width, - int height); - -#ifdef __cplusplus -} // extern "C" -} // namespace libyuv -#endif - -#endif // INCLUDE_LIBYUV_ROTATE_H_ diff --git a/thirdparty/libyuv/include/libyuv/rotate_argb.h b/thirdparty/libyuv/include/libyuv/rotate_argb.h deleted file mode 100644 index 2043294..0000000 --- a/thirdparty/libyuv/include/libyuv/rotate_argb.h +++ /dev/null @@ -1,37 +0,0 @@ -/* - * Copyright 2012 The LibYuv Project Authors. All rights reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#ifndef INCLUDE_LIBYUV_ROTATE_ARGB_H_ -#define INCLUDE_LIBYUV_ROTATE_ARGB_H_ - -#include "libyuv/basic_types.h" -#include "libyuv/rotate.h" // For RotationMode. - -#ifdef __cplusplus -namespace libyuv { -extern "C" { -#endif - -// Rotate ARGB frame -LIBYUV_API -int ARGBRotate(const uint8_t* src_argb, - int src_stride_argb, - uint8_t* dst_argb, - int dst_stride_argb, - int src_width, - int src_height, - enum RotationMode mode); - -#ifdef __cplusplus -} // extern "C" -} // namespace libyuv -#endif - -#endif // INCLUDE_LIBYUV_ROTATE_ARGB_H_ diff --git a/thirdparty/libyuv/include/libyuv/rotate_row.h b/thirdparty/libyuv/include/libyuv/rotate_row.h deleted file mode 100644 index 5a9cf93..0000000 --- a/thirdparty/libyuv/include/libyuv/rotate_row.h +++ /dev/null @@ -1,224 +0,0 @@ -/* - * Copyright 2013 The LibYuv Project Authors. All rights reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#ifndef INCLUDE_LIBYUV_ROTATE_ROW_H_ -#define INCLUDE_LIBYUV_ROTATE_ROW_H_ - -#include "libyuv/basic_types.h" - -#ifdef __cplusplus -namespace libyuv { -extern "C" { -#endif - -#if defined(__pnacl__) || defined(__CLR_VER) || \ - (defined(__native_client__) && defined(__x86_64__)) || \ - (defined(__i386__) && !defined(__SSE__) && !defined(__clang__)) -#define LIBYUV_DISABLE_X86 -#endif -#if defined(__native_client__) -#define LIBYUV_DISABLE_NEON -#endif -// MemorySanitizer does not support assembly code yet. 
http://crbug.com/344505 -#if defined(__has_feature) -#if __has_feature(memory_sanitizer) -#define LIBYUV_DISABLE_X86 -#endif -#endif -// The following are available for Visual C 32 bit: -#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER) && \ - !defined(__clang__) -#define HAS_TRANSPOSEWX8_SSSE3 -#define HAS_TRANSPOSEUVWX8_SSE2 -#endif - -// The following are available for GCC 32 or 64 bit: -#if !defined(LIBYUV_DISABLE_X86) && (defined(__i386__) || defined(__x86_64__)) -#define HAS_TRANSPOSEWX8_SSSE3 -#endif - -// The following are available for 64 bit GCC: -#if !defined(LIBYUV_DISABLE_X86) && defined(__x86_64__) -#define HAS_TRANSPOSEWX8_FAST_SSSE3 -#define HAS_TRANSPOSEUVWX8_SSE2 -#endif - -#if !defined(LIBYUV_DISABLE_NEON) && \ - (defined(__ARM_NEON__) || defined(LIBYUV_NEON) || defined(__aarch64__)) -#define HAS_TRANSPOSEWX8_NEON -#define HAS_TRANSPOSEUVWX8_NEON -#endif - -#if !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa) -#define HAS_TRANSPOSEWX16_MSA -#define HAS_TRANSPOSEUVWX16_MSA -#endif - -#if !defined(LIBYUV_DISABLE_MMI) && defined(_MIPS_ARCH_LOONGSON3A) -#define HAS_TRANSPOSEWX8_MMI -#define HAS_TRANSPOSEUVWX8_MMI -#endif - -void TransposeWxH_C(const uint8_t* src, - int src_stride, - uint8_t* dst, - int dst_stride, - int width, - int height); - -void TransposeWx8_C(const uint8_t* src, - int src_stride, - uint8_t* dst, - int dst_stride, - int width); -void TransposeWx16_C(const uint8_t* src, - int src_stride, - uint8_t* dst, - int dst_stride, - int width); -void TransposeWx8_NEON(const uint8_t* src, - int src_stride, - uint8_t* dst, - int dst_stride, - int width); -void TransposeWx8_SSSE3(const uint8_t* src, - int src_stride, - uint8_t* dst, - int dst_stride, - int width); -void TransposeWx8_MMI(const uint8_t* src, - int src_stride, - uint8_t* dst, - int dst_stride, - int width); -void TransposeWx8_Fast_SSSE3(const uint8_t* src, - int src_stride, - uint8_t* dst, - int dst_stride, - int width); -void TransposeWx16_MSA(const uint8_t* src, - int src_stride, - uint8_t* dst, - int dst_stride, - int width); - -void TransposeWx8_Any_NEON(const uint8_t* src, - int src_stride, - uint8_t* dst, - int dst_stride, - int width); -void TransposeWx8_Any_SSSE3(const uint8_t* src, - int src_stride, - uint8_t* dst, - int dst_stride, - int width); -void TransposeWx8_Any_MMI(const uint8_t* src, - int src_stride, - uint8_t* dst, - int dst_stride, - int width); -void TransposeWx8_Fast_Any_SSSE3(const uint8_t* src, - int src_stride, - uint8_t* dst, - int dst_stride, - int width); -void TransposeWx16_Any_MSA(const uint8_t* src, - int src_stride, - uint8_t* dst, - int dst_stride, - int width); - -void TransposeUVWxH_C(const uint8_t* src, - int src_stride, - uint8_t* dst_a, - int dst_stride_a, - uint8_t* dst_b, - int dst_stride_b, - int width, - int height); - -void TransposeUVWx8_C(const uint8_t* src, - int src_stride, - uint8_t* dst_a, - int dst_stride_a, - uint8_t* dst_b, - int dst_stride_b, - int width); -void TransposeUVWx16_C(const uint8_t* src, - int src_stride, - uint8_t* dst_a, - int dst_stride_a, - uint8_t* dst_b, - int dst_stride_b, - int width); -void TransposeUVWx8_SSE2(const uint8_t* src, - int src_stride, - uint8_t* dst_a, - int dst_stride_a, - uint8_t* dst_b, - int dst_stride_b, - int width); -void TransposeUVWx8_NEON(const uint8_t* src, - int src_stride, - uint8_t* dst_a, - int dst_stride_a, - uint8_t* dst_b, - int dst_stride_b, - int width); -void TransposeUVWx8_MMI(const uint8_t* src, - int src_stride, - uint8_t* dst_a, - int dst_stride_a, - uint8_t* dst_b, 
- int dst_stride_b, - int width); -void TransposeUVWx16_MSA(const uint8_t* src, - int src_stride, - uint8_t* dst_a, - int dst_stride_a, - uint8_t* dst_b, - int dst_stride_b, - int width); - -void TransposeUVWx8_Any_SSE2(const uint8_t* src, - int src_stride, - uint8_t* dst_a, - int dst_stride_a, - uint8_t* dst_b, - int dst_stride_b, - int width); -void TransposeUVWx8_Any_NEON(const uint8_t* src, - int src_stride, - uint8_t* dst_a, - int dst_stride_a, - uint8_t* dst_b, - int dst_stride_b, - int width); -void TransposeUVWx8_Any_MMI(const uint8_t* src, - int src_stride, - uint8_t* dst_a, - int dst_stride_a, - uint8_t* dst_b, - int dst_stride_b, - int width); -void TransposeUVWx16_Any_MSA(const uint8_t* src, - int src_stride, - uint8_t* dst_a, - int dst_stride_a, - uint8_t* dst_b, - int dst_stride_b, - int width); - -#ifdef __cplusplus -} // extern "C" -} // namespace libyuv -#endif - -#endif // INCLUDE_LIBYUV_ROTATE_ROW_H_ diff --git a/thirdparty/libyuv/include/libyuv/row.h b/thirdparty/libyuv/include/libyuv/row.h deleted file mode 100644 index 6c3f81e..0000000 --- a/thirdparty/libyuv/include/libyuv/row.h +++ /dev/null @@ -1,5274 +0,0 @@ -/* - * Copyright 2011 The LibYuv Project Authors. All rights reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#ifndef INCLUDE_LIBYUV_ROW_H_ -#define INCLUDE_LIBYUV_ROW_H_ - -#include // For malloc. - -#include "libyuv/basic_types.h" - -#ifdef __cplusplus -namespace libyuv { -extern "C" { -#endif - -#if defined(__pnacl__) || defined(__CLR_VER) || \ - (defined(__native_client__) && defined(__x86_64__)) || \ - (defined(__i386__) && !defined(__SSE__) && !defined(__clang__)) -#define LIBYUV_DISABLE_X86 -#endif -#if defined(__native_client__) -#define LIBYUV_DISABLE_NEON -#endif -// MemorySanitizer does not support assembly code yet. http://crbug.com/344505 -#if defined(__has_feature) -#if __has_feature(memory_sanitizer) -#define LIBYUV_DISABLE_X86 -#endif -#endif -// clang >= 3.5.0 required for Arm64. -#if defined(__clang__) && defined(__aarch64__) && !defined(LIBYUV_DISABLE_NEON) -#if (__clang_major__ < 3) || (__clang_major__ == 3 && (__clang_minor__ < 5)) -#define LIBYUV_DISABLE_NEON -#endif // clang >= 3.5 -#endif // __clang__ - -// GCC >= 4.7.0 required for AVX2. -#if defined(__GNUC__) && (defined(__x86_64__) || defined(__i386__)) -#if (__GNUC__ > 4) || (__GNUC__ == 4 && (__GNUC_MINOR__ >= 7)) -#define GCC_HAS_AVX2 1 -#endif // GNUC >= 4.7 -#endif // __GNUC__ - -// clang >= 3.4.0 required for AVX2. -#if defined(__clang__) && (defined(__x86_64__) || defined(__i386__)) -#if (__clang_major__ > 3) || (__clang_major__ == 3 && (__clang_minor__ >= 4)) -#define CLANG_HAS_AVX2 1 -#endif // clang >= 3.4 -#endif // __clang__ - -// clang >= 6.0.0 required for AVX512. -#if defined(__clang__) && (defined(__x86_64__) || defined(__i386__)) -// clang in xcode follows a different versioning scheme. -// TODO(fbarchard): fix xcode 9 ios b/789. -#if (__clang_major__ >= 7) && !defined(__APPLE__) -#define CLANG_HAS_AVX512 1 -#endif // clang >= 7 -#endif // __clang__ - -// Visual C 2012 required for AVX2. 
-#if defined(_M_IX86) && !defined(__clang__) && defined(_MSC_VER) && \ - _MSC_VER >= 1700 -#define VISUALC_HAS_AVX2 1 -#endif // VisualStudio >= 2012 - -// The following are available on all x86 platforms: -#if !defined(LIBYUV_DISABLE_X86) && \ - (defined(_M_IX86) || defined(__x86_64__) || defined(__i386__)) -// Conversions: -#define HAS_ABGRTOUVROW_SSSE3 -#define HAS_ABGRTOYROW_SSSE3 -#define HAS_ARGB1555TOARGBROW_SSE2 -#define HAS_ARGB4444TOARGBROW_SSE2 -#define HAS_ARGBEXTRACTALPHAROW_SSE2 -#define HAS_ARGBSETROW_X86 -#define HAS_ARGBSHUFFLEROW_SSSE3 -#define HAS_ARGBTOARGB1555ROW_SSE2 -#define HAS_ARGBTOARGB4444ROW_SSE2 -#define HAS_ARGBTORAWROW_SSSE3 -#define HAS_ARGBTORGB24ROW_SSSE3 -#define HAS_ARGBTORGB565DITHERROW_SSE2 -#define HAS_ARGBTORGB565ROW_SSE2 -#define HAS_ARGBTOUV444ROW_SSSE3 -#define HAS_ARGBTOUVJROW_SSSE3 -#define HAS_ARGBTOUVROW_SSSE3 -#define HAS_ARGBTOYJROW_SSSE3 -#define HAS_ARGBTOYROW_SSSE3 -#define HAS_BGRATOUVROW_SSSE3 -#define HAS_BGRATOYROW_SSSE3 -#define HAS_COPYROW_ERMS -#define HAS_COPYROW_SSE2 -#define HAS_H422TOARGBROW_SSSE3 -#define HAS_HALFFLOATROW_SSE2 -#define HAS_I422TOARGB1555ROW_SSSE3 -#define HAS_I422TOARGB4444ROW_SSSE3 -#define HAS_I422TOARGBROW_SSSE3 -#define HAS_I422TORGB24ROW_SSSE3 -#define HAS_I422TORGB565ROW_SSSE3 -#define HAS_I422TORGBAROW_SSSE3 -#define HAS_I422TOUYVYROW_SSE2 -#define HAS_I422TOYUY2ROW_SSE2 -#define HAS_I444TOARGBROW_SSSE3 -#define HAS_J400TOARGBROW_SSE2 -#define HAS_J422TOARGBROW_SSSE3 -#define HAS_MERGEUVROW_SSE2 -#define HAS_MIRRORROW_SSSE3 -#define HAS_MIRRORSPLITUVROW_SSSE3 -#define HAS_NV12TOARGBROW_SSSE3 -#define HAS_NV12TORGB24ROW_SSSE3 -#define HAS_NV12TORGB565ROW_SSSE3 -#define HAS_NV21TOARGBROW_SSSE3 -#define HAS_NV21TORGB24ROW_SSSE3 -#define HAS_RAWTOARGBROW_SSSE3 -#define HAS_RAWTORGB24ROW_SSSE3 -#define HAS_RAWTOYROW_SSSE3 -#define HAS_RGB24TOARGBROW_SSSE3 -#define HAS_RGB24TOYROW_SSSE3 -#define HAS_RGB24TOYJROW_SSSE3 -#define HAS_RAWTOYJROW_SSSE3 -#define HAS_RGB565TOARGBROW_SSE2 -#define HAS_RGBATOUVROW_SSSE3 -#define HAS_RGBATOYROW_SSSE3 -#define HAS_SETROW_ERMS -#define HAS_SETROW_X86 -#define HAS_SPLITUVROW_SSE2 -#define HAS_UYVYTOARGBROW_SSSE3 -#define HAS_UYVYTOUV422ROW_SSE2 -#define HAS_UYVYTOUVROW_SSE2 -#define HAS_UYVYTOYROW_SSE2 -#define HAS_YUY2TOARGBROW_SSSE3 -#define HAS_YUY2TOUV422ROW_SSE2 -#define HAS_YUY2TOUVROW_SSE2 -#define HAS_YUY2TOYROW_SSE2 - -// Effects: -#define HAS_ARGBADDROW_SSE2 -#define HAS_ARGBAFFINEROW_SSE2 -#define HAS_ARGBATTENUATEROW_SSSE3 -#define HAS_ARGBBLENDROW_SSSE3 -#define HAS_ARGBCOLORMATRIXROW_SSSE3 -#define HAS_ARGBCOLORTABLEROW_X86 -#define HAS_ARGBCOPYALPHAROW_SSE2 -#define HAS_ARGBCOPYYTOALPHAROW_SSE2 -#define HAS_ARGBGRAYROW_SSSE3 -#define HAS_ARGBLUMACOLORTABLEROW_SSSE3 -#define HAS_ARGBMIRRORROW_SSE2 -#define HAS_ARGBMULTIPLYROW_SSE2 -#define HAS_ARGBPOLYNOMIALROW_SSE2 -#define HAS_ARGBQUANTIZEROW_SSE2 -#define HAS_ARGBSEPIAROW_SSSE3 -#define HAS_ARGBSHADEROW_SSE2 -#define HAS_ARGBSUBTRACTROW_SSE2 -#define HAS_ARGBUNATTENUATEROW_SSE2 -#define HAS_BLENDPLANEROW_SSSE3 -#define HAS_COMPUTECUMULATIVESUMROW_SSE2 -#define HAS_CUMULATIVESUMTOAVERAGEROW_SSE2 -#define HAS_INTERPOLATEROW_SSSE3 -#define HAS_RGBCOLORTABLEROW_X86 -#define HAS_SOBELROW_SSE2 -#define HAS_SOBELTOPLANEROW_SSE2 -#define HAS_SOBELXROW_SSE2 -#define HAS_SOBELXYROW_SSE2 -#define HAS_SOBELYROW_SSE2 - -// The following functions fail on gcc/clang 32 bit with fpic and framepointer. -// caveat: clangcl uses row_win.cc which works. 
-#if defined(__x86_64__) || !defined(__pic__) || defined(__clang__) || \ - defined(_MSC_VER) -// TODO(fbarchard): fix build error on android_full_debug=1 -// https://code.google.com/p/libyuv/issues/detail?id=517 -#define HAS_I422ALPHATOARGBROW_SSSE3 -#define HAS_I444ALPHATOARGBROW_SSSE3 -#endif -#endif - -// The following are available on all x86 platforms, but -// require VS2012, clang 3.4 or gcc 4.7. -#if !defined(LIBYUV_DISABLE_X86) && \ - (defined(VISUALC_HAS_AVX2) || defined(CLANG_HAS_AVX2) || \ - defined(GCC_HAS_AVX2)) -#define HAS_ARGBCOPYALPHAROW_AVX2 -#define HAS_ARGBCOPYYTOALPHAROW_AVX2 -#define HAS_ARGBEXTRACTALPHAROW_AVX2 -#define HAS_ARGBMIRRORROW_AVX2 -#define HAS_ARGBPOLYNOMIALROW_AVX2 -#define HAS_ARGBSHUFFLEROW_AVX2 -#define HAS_ARGBTORGB565DITHERROW_AVX2 -#define HAS_ARGBTOUVJROW_AVX2 -#define HAS_ARGBTOUVROW_AVX2 -#define HAS_ARGBTOYJROW_AVX2 -#define HAS_ARGBTOYROW_AVX2 -#define HAS_RGB24TOYJROW_AVX2 -#define HAS_RAWTOYJROW_AVX2 -#define HAS_COPYROW_AVX -#define HAS_H422TOARGBROW_AVX2 -#define HAS_HALFFLOATROW_AVX2 -// #define HAS_HALFFLOATROW_F16C // Enable to test halffloat cast -#define HAS_I422TOARGB1555ROW_AVX2 -#define HAS_I422TOARGB4444ROW_AVX2 -#define HAS_I422TOARGBROW_AVX2 -#define HAS_I422TORGB24ROW_AVX2 -#define HAS_I422TORGB565ROW_AVX2 -#define HAS_I422TORGBAROW_AVX2 -#define HAS_I444TOARGBROW_AVX2 -#define HAS_INTERPOLATEROW_AVX2 -#define HAS_J422TOARGBROW_AVX2 -#define HAS_MERGEUVROW_AVX2 -#define HAS_MIRRORROW_AVX2 -#define HAS_NV12TOARGBROW_AVX2 -#define HAS_NV12TORGB24ROW_AVX2 -#define HAS_NV12TORGB565ROW_AVX2 -#define HAS_NV21TOARGBROW_AVX2 -#define HAS_NV21TORGB24ROW_AVX2 -#define HAS_SPLITUVROW_AVX2 -#define HAS_UYVYTOARGBROW_AVX2 -#define HAS_UYVYTOUV422ROW_AVX2 -#define HAS_UYVYTOUVROW_AVX2 -#define HAS_UYVYTOYROW_AVX2 -#define HAS_YUY2TOARGBROW_AVX2 -#define HAS_YUY2TOUV422ROW_AVX2 -#define HAS_YUY2TOUVROW_AVX2 -#define HAS_YUY2TOYROW_AVX2 - -// Effects: -#define HAS_ARGBADDROW_AVX2 -#define HAS_ARGBATTENUATEROW_AVX2 -#define HAS_ARGBMULTIPLYROW_AVX2 -#define HAS_ARGBSUBTRACTROW_AVX2 -#define HAS_ARGBUNATTENUATEROW_AVX2 -#define HAS_BLENDPLANEROW_AVX2 - -#if defined(__x86_64__) || !defined(__pic__) || defined(__clang__) || \ - defined(_MSC_VER) -// TODO(fbarchard): fix build error on android_full_debug=1 -// https://code.google.com/p/libyuv/issues/detail?id=517 -#define HAS_I422ALPHATOARGBROW_AVX2 -#define HAS_I444ALPHATOARGBROW_AVX2 -#endif -#endif - -// The following are available for AVX2 Visual C 32 bit: -// TODO(fbarchard): Port to gcc. -#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER) && \ - !defined(__clang__) && defined(VISUALC_HAS_AVX2) -#define HAS_ARGB1555TOARGBROW_AVX2 -#define HAS_ARGB4444TOARGBROW_AVX2 -#define HAS_ARGBTOARGB1555ROW_AVX2 -#define HAS_ARGBTOARGB4444ROW_AVX2 -#define HAS_ARGBTORGB565ROW_AVX2 -#define HAS_J400TOARGBROW_AVX2 -#define HAS_RGB565TOARGBROW_AVX2 -#endif - -// The following are also available on x64 Visual C. 
-#if !defined(LIBYUV_DISABLE_X86) && defined(_MSC_VER) && defined(_M_X64) && \ - (!defined(__clang__) || defined(__SSSE3__)) -#define HAS_I444ALPHATOARGBROW_SSSE3 -#define HAS_I444TOARGBROW_SSSE3 -#define HAS_I422ALPHATOARGBROW_SSSE3 -#define HAS_I422TOARGBROW_SSSE3 -#endif - -// The following are available for gcc/clang x86 platforms: -// TODO(fbarchard): Port to Visual C -#if !defined(LIBYUV_DISABLE_X86) && \ - (defined(__x86_64__) || defined(__i386__)) -#define HAS_ABGRTOAR30ROW_SSSE3 -#define HAS_ARGBTOAR30ROW_SSSE3 -#define HAS_ARGBTOAR64ROW_SSSE3 -#define HAS_ARGBTOAB64ROW_SSSE3 -#define HAS_AR64TOARGBROW_SSSE3 -#define HAS_AB64TOARGBROW_SSSE3 -#define HAS_CONVERT16TO8ROW_SSSE3 -#define HAS_CONVERT8TO16ROW_SSE2 -#define HAS_HALFMERGEUVROW_SSSE3 -#define HAS_I210TOAR30ROW_SSSE3 -#define HAS_I210TOARGBROW_SSSE3 -#define HAS_I212TOAR30ROW_SSSE3 -#define HAS_I212TOARGBROW_SSSE3 -#define HAS_I400TOARGBROW_SSE2 -#define HAS_I422TOAR30ROW_SSSE3 -#define HAS_I410TOAR30ROW_SSSE3 -#define HAS_I410TOARGBROW_SSSE3 -#define HAS_MERGEARGBROW_SSE2 -#define HAS_MERGEXRGBROW_SSE2 -#define HAS_MERGERGBROW_SSSE3 -#define HAS_MIRRORUVROW_SSSE3 -#define HAS_P210TOAR30ROW_SSSE3 -#define HAS_P210TOARGBROW_SSSE3 -#define HAS_P410TOAR30ROW_SSSE3 -#define HAS_P410TOARGBROW_SSSE3 -#define HAS_RAWTORGBAROW_SSSE3 -#define HAS_RGB24MIRRORROW_SSSE3 -#define HAS_RGBATOYJROW_SSSE3 -#define HAS_SPLITARGBROW_SSE2 -#define HAS_SPLITARGBROW_SSSE3 -#define HAS_SPLITXRGBROW_SSE2 -#define HAS_SPLITXRGBROW_SSSE3 -#define HAS_SPLITRGBROW_SSSE3 -#define HAS_SWAPUVROW_SSSE3 - -#if defined(__x86_64__) || !defined(__pic__) -// TODO(fbarchard): fix build error on android_full_debug=1 -// https://code.google.com/p/libyuv/issues/detail?id=517 -#define HAS_I210ALPHATOARGBROW_SSSE3 -#define HAS_I410ALPHATOARGBROW_SSSE3 -#endif -#endif - -// The following are available for AVX2 gcc/clang x86 platforms: -// TODO(fbarchard): Port to Visual C -#if !defined(LIBYUV_DISABLE_X86) && \ - (defined(__x86_64__) || defined(__i386__)) && \ - (defined(CLANG_HAS_AVX2) || defined(GCC_HAS_AVX2)) -#define HAS_ABGRTOAR30ROW_AVX2 -#define HAS_ABGRTOUVROW_AVX2 -#define HAS_ABGRTOYROW_AVX2 -#define HAS_ARGBTOAR30ROW_AVX2 -#define HAS_ARGBTORAWROW_AVX2 -#define HAS_ARGBTORGB24ROW_AVX2 -#define HAS_ARGBTOAR64ROW_AVX2 -#define HAS_ARGBTOAB64ROW_AVX2 -#define HAS_AR64TOARGBROW_AVX2 -#define HAS_AB64TOARGBROW_AVX2 -#define HAS_CONVERT16TO8ROW_AVX2 -#define HAS_CONVERT8TO16ROW_AVX2 -#define HAS_DIVIDEROW_16_AVX2 -#define HAS_HALFMERGEUVROW_AVX2 -#define HAS_MERGEAR64ROW_AVX2 -#define HAS_MERGEARGB16TO8ROW_AVX2 -#define HAS_MERGEARGBROW_AVX2 -#define HAS_MERGEXR30ROW_AVX2 -#define HAS_MERGEXR64ROW_AVX2 -#define HAS_MERGEXRGB16TO8ROW_AVX2 -#define HAS_MERGEXRGBROW_AVX2 -#define HAS_I210TOAR30ROW_AVX2 -#define HAS_I210TOARGBROW_AVX2 -#define HAS_I212TOAR30ROW_AVX2 -#define HAS_I212TOARGBROW_AVX2 -#define HAS_I400TOARGBROW_AVX2 -#define HAS_I410TOAR30ROW_AVX2 -#define HAS_I410TOARGBROW_AVX2 -#define HAS_P210TOAR30ROW_AVX2 -#define HAS_P210TOARGBROW_AVX2 -#define HAS_P410TOAR30ROW_AVX2 -#define HAS_P410TOARGBROW_AVX2 -#define HAS_I422TOAR30ROW_AVX2 -#define HAS_I422TOUYVYROW_AVX2 -#define HAS_I422TOYUY2ROW_AVX2 -#define HAS_MERGEUVROW_16_AVX2 -#define HAS_MIRRORUVROW_AVX2 -#define HAS_MULTIPLYROW_16_AVX2 -#define HAS_RGBATOYJROW_AVX2 -#define HAS_SPLITARGBROW_AVX2 -#define HAS_SPLITXRGBROW_AVX2 -#define HAS_SPLITUVROW_16_AVX2 -#define HAS_SWAPUVROW_AVX2 -// TODO(fbarchard): Fix AVX2 version of YUV24 -// #define HAS_NV21TOYUV24ROW_AVX2 - -#if defined(__x86_64__) || 
!defined(__pic__) -// TODO(fbarchard): fix build error on android_full_debug=1 -// https://code.google.com/p/libyuv/issues/detail?id=517 -#define HAS_I210ALPHATOARGBROW_AVX2 -#define HAS_I410ALPHATOARGBROW_AVX2 -#endif -#endif - -// The following are available for AVX512 clang x86 platforms: -// TODO(fbarchard): Port to GCC and Visual C -// TODO(fbarchard): re-enable HAS_ARGBTORGB24ROW_AVX512VBMI. Issue libyuv:789 -#if !defined(LIBYUV_DISABLE_X86) && \ - (defined(__x86_64__) || defined(__i386__)) && \ - (defined(CLANG_HAS_AVX512)) -#define HAS_ARGBTORGB24ROW_AVX512VBMI -#endif - -// The following are available on Neon platforms: -#if !defined(LIBYUV_DISABLE_NEON) && \ - (defined(__aarch64__) || defined(__ARM_NEON__) || defined(LIBYUV_NEON)) -#define HAS_ABGRTOUVROW_NEON -#define HAS_ABGRTOYROW_NEON -#define HAS_ARGB1555TOARGBROW_NEON -#define HAS_ARGB1555TOUVROW_NEON -#define HAS_ARGB1555TOYROW_NEON -#define HAS_ARGB4444TOARGBROW_NEON -#define HAS_ARGB4444TOUVROW_NEON -#define HAS_ARGB4444TOYROW_NEON -#define HAS_ARGBEXTRACTALPHAROW_NEON -#define HAS_ARGBSETROW_NEON -#define HAS_ARGBTOARGB1555ROW_NEON -#define HAS_ARGBTOARGB4444ROW_NEON -#define HAS_ARGBTORAWROW_NEON -#define HAS_ARGBTORGB24ROW_NEON -#define HAS_ARGBTORGB565DITHERROW_NEON -#define HAS_ARGBTORGB565ROW_NEON -#define HAS_ARGBTOAR64ROW_NEON -#define HAS_ARGBTOAB64ROW_NEON -#define HAS_AR64TOARGBROW_NEON -#define HAS_AB64TOARGBROW_NEON -#define HAS_ARGBTOUV444ROW_NEON -#define HAS_ARGBTOUVJROW_NEON -#define HAS_ARGBTOUVROW_NEON -#define HAS_ARGBTOYJROW_NEON -#define HAS_ARGBTOYROW_NEON -#define HAS_AYUVTOUVROW_NEON -#define HAS_AYUVTOVUROW_NEON -#define HAS_AYUVTOYROW_NEON -#define HAS_BGRATOUVROW_NEON -#define HAS_BGRATOYROW_NEON -#define HAS_BYTETOFLOATROW_NEON -#define HAS_COPYROW_NEON -#define HAS_DIVIDEROW_16_NEON -#define HAS_HALFFLOATROW_NEON -#define HAS_HALFMERGEUVROW_NEON -#define HAS_I400TOARGBROW_NEON -#define HAS_I444ALPHATOARGBROW_NEON -#define HAS_I422ALPHATOARGBROW_NEON -#define HAS_I422TOARGB1555ROW_NEON -#define HAS_I422TOARGB4444ROW_NEON -#define HAS_I422TOARGBROW_NEON -#define HAS_I422TORGB24ROW_NEON -#define HAS_I422TORGB565ROW_NEON -#define HAS_I422TORGBAROW_NEON -#define HAS_I422TOUYVYROW_NEON -#define HAS_I422TOYUY2ROW_NEON -#define HAS_I444TOARGBROW_NEON -#define HAS_J400TOARGBROW_NEON -#define HAS_MERGEAR64ROW_NEON -#define HAS_MERGEARGB16TO8ROW_NEON -#define HAS_MERGEARGBROW_NEON -#define HAS_MERGEXR30ROW_NEON -#define HAS_MERGEXR64ROW_NEON -#define HAS_MERGEXRGB16TO8ROW_NEON -#define HAS_MERGEXRGBROW_NEON -#define HAS_MERGEUVROW_NEON -#define HAS_MERGEUVROW_16_NEON -#define HAS_MIRRORROW_NEON -#define HAS_MIRRORUVROW_NEON -#define HAS_MIRRORSPLITUVROW_NEON -#define HAS_MULTIPLYROW_16_NEON -#define HAS_NV12TOARGBROW_NEON -#define HAS_NV12TORGB24ROW_NEON -#define HAS_NV12TORGB565ROW_NEON -#define HAS_NV21TOARGBROW_NEON -#define HAS_NV21TORGB24ROW_NEON -#define HAS_NV21TOYUV24ROW_NEON -#define HAS_RAWTOARGBROW_NEON -#define HAS_RAWTORGB24ROW_NEON -#define HAS_RAWTORGBAROW_NEON -#define HAS_RAWTOUVROW_NEON -#define HAS_RAWTOYJROW_NEON -#define HAS_RAWTOYROW_NEON -#define HAS_RGB24TOARGBROW_NEON -#define HAS_RGB24TOUVROW_NEON -#define HAS_RGB24TOYJROW_NEON -#define HAS_RGB24TOYROW_NEON -#define HAS_RGB565TOARGBROW_NEON -#define HAS_RGB565TOUVROW_NEON -#define HAS_RGB565TOYROW_NEON -#define HAS_RGBATOUVROW_NEON -#define HAS_RGBATOYJROW_NEON -#define HAS_RGBATOYROW_NEON -#define HAS_SETROW_NEON -#define HAS_SPLITARGBROW_NEON -#define HAS_SPLITXRGBROW_NEON -#define HAS_SPLITRGBROW_NEON -#define 
HAS_SPLITUVROW_NEON -#define HAS_SPLITUVROW_16_NEON -#define HAS_SWAPUVROW_NEON -#define HAS_UYVYTOARGBROW_NEON -#define HAS_UYVYTOUV422ROW_NEON -#define HAS_UYVYTOUVROW_NEON -#define HAS_UYVYTOYROW_NEON -#define HAS_YUY2TOARGBROW_NEON -#define HAS_YUY2TOUV422ROW_NEON -#define HAS_YUY2TOUVROW_NEON -#define HAS_YUY2TOYROW_NEON - -// Effects: -#define HAS_ARGBADDROW_NEON -#define HAS_ARGBATTENUATEROW_NEON -#define HAS_ARGBBLENDROW_NEON -#define HAS_ARGBCOLORMATRIXROW_NEON -#define HAS_ARGBGRAYROW_NEON -#define HAS_ARGBMIRRORROW_NEON -#define HAS_RGB24MIRRORROW_NEON -#define HAS_ARGBMULTIPLYROW_NEON -#define HAS_ARGBQUANTIZEROW_NEON -#define HAS_ARGBSEPIAROW_NEON -#define HAS_ARGBSHADEROW_NEON -#define HAS_ARGBSHUFFLEROW_NEON -#define HAS_ARGBSUBTRACTROW_NEON -#define HAS_INTERPOLATEROW_NEON -#define HAS_SOBELROW_NEON -#define HAS_SOBELTOPLANEROW_NEON -#define HAS_SOBELXROW_NEON -#define HAS_SOBELXYROW_NEON -#define HAS_SOBELYROW_NEON -#endif - -// The following are available on AArch64 platforms: -#if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__) -#define HAS_SCALESUMSAMPLES_NEON -#define HAS_GAUSSROW_F32_NEON -#define HAS_GAUSSCOL_F32_NEON - -#endif -#if !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa) -#define HAS_ABGRTOUVROW_MSA -#define HAS_ABGRTOYROW_MSA -#define HAS_ARGB1555TOARGBROW_MSA -#define HAS_ARGB1555TOUVROW_MSA -#define HAS_ARGB1555TOYROW_MSA -#define HAS_ARGB4444TOARGBROW_MSA -#define HAS_ARGBADDROW_MSA -#define HAS_ARGBATTENUATEROW_MSA -#define HAS_ARGBBLENDROW_MSA -#define HAS_ARGBCOLORMATRIXROW_MSA -#define HAS_ARGBEXTRACTALPHAROW_MSA -#define HAS_ARGBGRAYROW_MSA -#define HAS_ARGBMIRRORROW_MSA -#define HAS_ARGBMULTIPLYROW_MSA -#define HAS_ARGBQUANTIZEROW_MSA -#define HAS_ARGBSEPIAROW_MSA -#define HAS_ARGBSETROW_MSA -#define HAS_ARGBSHADEROW_MSA -#define HAS_ARGBSHUFFLEROW_MSA -#define HAS_ARGBSUBTRACTROW_MSA -#define HAS_ARGBTOARGB1555ROW_MSA -#define HAS_ARGBTOARGB4444ROW_MSA -#define HAS_ARGBTORAWROW_MSA -#define HAS_ARGBTORGB24ROW_MSA -#define HAS_ARGBTORGB565DITHERROW_MSA -#define HAS_ARGBTORGB565ROW_MSA -#define HAS_ARGBTOUV444ROW_MSA -#define HAS_ARGBTOUVJROW_MSA -#define HAS_ARGBTOUVROW_MSA -#define HAS_ARGBTOYJROW_MSA -#define HAS_ARGBTOYROW_MSA -#define HAS_BGRATOUVROW_MSA -#define HAS_BGRATOYROW_MSA -#define HAS_HALFFLOATROW_MSA -#define HAS_I400TOARGBROW_MSA -#define HAS_I422TOUYVYROW_MSA -#define HAS_I422TOYUY2ROW_MSA -#define HAS_INTERPOLATEROW_MSA -#define HAS_J400TOARGBROW_MSA -#define HAS_MERGEUVROW_MSA -#define HAS_MIRRORROW_MSA -#define HAS_MIRRORUVROW_MSA -#define HAS_MIRRORSPLITUVROW_MSA -#define HAS_RAWTOARGBROW_MSA -#define HAS_RAWTORGB24ROW_MSA -#define HAS_RAWTOUVROW_MSA -#define HAS_RAWTOYROW_MSA -#define HAS_RGB24TOARGBROW_MSA -#define HAS_RGB24TOUVROW_MSA -#define HAS_RGB24TOYROW_MSA -#define HAS_RGB565TOARGBROW_MSA -#define HAS_RGB565TOUVROW_MSA -#define HAS_RGB565TOYROW_MSA -#define HAS_RGBATOUVROW_MSA -#define HAS_RGBATOYROW_MSA -#define HAS_SETROW_MSA -#define HAS_SOBELROW_MSA -#define HAS_SOBELTOPLANEROW_MSA -#define HAS_SOBELXROW_MSA -#define HAS_SOBELXYROW_MSA -#define HAS_SOBELYROW_MSA -#define HAS_SPLITUVROW_MSA -#define HAS_UYVYTOUVROW_MSA -#define HAS_UYVYTOYROW_MSA -#define HAS_YUY2TOUV422ROW_MSA -#define HAS_YUY2TOUVROW_MSA -#define HAS_YUY2TOYROW_MSA -#endif - -#if !defined(LIBYUV_DISABLE_MMI) && defined(_MIPS_ARCH_LOONGSON3A) -#define HAS_ABGRTOUVROW_MMI -#define HAS_ABGRTOYROW_MMI -#define HAS_ARGB1555TOARGBROW_MMI -#define HAS_ARGB1555TOUVROW_MMI -#define HAS_ARGB1555TOYROW_MMI -#define HAS_ARGB4444TOARGBROW_MMI 
-#define HAS_ARGB4444TOUVROW_MMI -#define HAS_ARGB4444TOYROW_MMI -#define HAS_ARGBADDROW_MMI -#define HAS_ARGBATTENUATEROW_MMI -#define HAS_ARGBBLENDROW_MMI -#define HAS_ARGBCOLORMATRIXROW_MMI -#define HAS_ARGBCOPYALPHAROW_MMI -#define HAS_ARGBCOPYYTOALPHAROW_MMI -#define HAS_ARGBEXTRACTALPHAROW_MMI -#define HAS_ARGBGRAYROW_MMI -#define HAS_ARGBMIRRORROW_MMI -#define HAS_ARGBMULTIPLYROW_MMI -#define HAS_ARGBSEPIAROW_MMI -#define HAS_ARGBSETROW_MMI -#define HAS_ARGBSHADEROW_MMI -#define HAS_ARGBSHUFFLEROW_MMI -#define HAS_ARGBSUBTRACTROW_MMI -#define HAS_ARGBTOARGB1555ROW_MMI -#define HAS_ARGBTOARGB4444ROW_MMI -#define HAS_ARGBTORAWROW_MMI -#define HAS_ARGBTORGB24ROW_MMI -#define HAS_ARGBTORGB565DITHERROW_MMI -#define HAS_ARGBTORGB565ROW_MMI -#define HAS_ARGBTOUV444ROW_MMI -#define HAS_ARGBTOUVJROW_MMI -#define HAS_ARGBTOUVROW_MMI -#define HAS_ARGBTOYJROW_MMI -#define HAS_ARGBTOYROW_MMI -#define HAS_BGRATOUVROW_MMI -#define HAS_BGRATOYROW_MMI -#define HAS_BLENDPLANEROW_MMI -#define HAS_COMPUTECUMULATIVESUMROW_MMI -#define HAS_CUMULATIVESUMTOAVERAGEROW_MMI -#define HAS_HALFFLOATROW_MMI -#define HAS_I400TOARGBROW_MMI -#define HAS_I422TOUYVYROW_MMI -#define HAS_I422TOYUY2ROW_MMI -#define HAS_INTERPOLATEROW_MMI -#define HAS_J400TOARGBROW_MMI -#define HAS_MERGERGBROW_MMI -#define HAS_MERGEUVROW_MMI -#define HAS_MIRRORROW_MMI -#define HAS_MIRRORSPLITUVROW_MMI -#define HAS_RAWTOARGBROW_MMI -#define HAS_RAWTORGB24ROW_MMI -#define HAS_RAWTOUVROW_MMI -#define HAS_RAWTOYROW_MMI -#define HAS_RGB24TOARGBROW_MMI -#define HAS_RGB24TOUVROW_MMI -#define HAS_RGB24TOYROW_MMI -#define HAS_RGB565TOARGBROW_MMI -#define HAS_RGB565TOUVROW_MMI -#define HAS_RGB565TOYROW_MMI -#define HAS_RGBATOUVROW_MMI -#define HAS_RGBATOYROW_MMI -#define HAS_SOBELROW_MMI -#define HAS_SOBELTOPLANEROW_MMI -#define HAS_SOBELXROW_MMI -#define HAS_SOBELXYROW_MMI -#define HAS_SOBELYROW_MMI -#define HAS_SPLITRGBROW_MMI -#define HAS_SPLITUVROW_MMI -#define HAS_UYVYTOUVROW_MMI -#define HAS_UYVYTOYROW_MMI -#define HAS_YUY2TOUV422ROW_MMI -#define HAS_YUY2TOUVROW_MMI -#define HAS_YUY2TOYROW_MMI -#endif - -#if defined(_MSC_VER) && !defined(__CLR_VER) && !defined(__clang__) -#if defined(VISUALC_HAS_AVX2) -#define SIMD_ALIGNED(var) __declspec(align(32)) var -#else -#define SIMD_ALIGNED(var) __declspec(align(16)) var -#endif -#define LIBYUV_NOINLINE __declspec(noinline) -typedef __declspec(align(16)) int16_t vec16[8]; -typedef __declspec(align(16)) int32_t vec32[4]; -typedef __declspec(align(16)) float vecf32[4]; -typedef __declspec(align(16)) int8_t vec8[16]; -typedef __declspec(align(16)) uint16_t uvec16[8]; -typedef __declspec(align(16)) uint32_t uvec32[4]; -typedef __declspec(align(16)) uint8_t uvec8[16]; -typedef __declspec(align(32)) int16_t lvec16[16]; -typedef __declspec(align(32)) int32_t lvec32[8]; -typedef __declspec(align(32)) int8_t lvec8[32]; -typedef __declspec(align(32)) uint16_t ulvec16[16]; -typedef __declspec(align(32)) uint32_t ulvec32[8]; -typedef __declspec(align(32)) uint8_t ulvec8[32]; -#elif !defined(__pnacl__) && (defined(__GNUC__) || defined(__clang__)) -// Caveat GCC 4.2 to 4.7 have a known issue using vectors with const. 
-#if defined(CLANG_HAS_AVX2) || defined(GCC_HAS_AVX2) -#define SIMD_ALIGNED(var) var __attribute__((aligned(32))) -#else -#define SIMD_ALIGNED(var) var __attribute__((aligned(16))) -#endif -#define LIBYUV_NOINLINE __attribute__((noinline)) -typedef int16_t __attribute__((vector_size(16))) vec16; -typedef int32_t __attribute__((vector_size(16))) vec32; -typedef float __attribute__((vector_size(16))) vecf32; -typedef int8_t __attribute__((vector_size(16))) vec8; -typedef uint16_t __attribute__((vector_size(16))) uvec16; -typedef uint32_t __attribute__((vector_size(16))) uvec32; -typedef uint8_t __attribute__((vector_size(16))) uvec8; -typedef int16_t __attribute__((vector_size(32))) lvec16; -typedef int32_t __attribute__((vector_size(32))) lvec32; -typedef int8_t __attribute__((vector_size(32))) lvec8; -typedef uint16_t __attribute__((vector_size(32))) ulvec16; -typedef uint32_t __attribute__((vector_size(32))) ulvec32; -typedef uint8_t __attribute__((vector_size(32))) ulvec8; -#else -#define SIMD_ALIGNED(var) var -#define LIBYUV_NOINLINE -typedef int16_t vec16[8]; -typedef int32_t vec32[4]; -typedef float vecf32[4]; -typedef int8_t vec8[16]; -typedef uint16_t uvec16[8]; -typedef uint32_t uvec32[4]; -typedef uint8_t uvec8[16]; -typedef int16_t lvec16[16]; -typedef int32_t lvec32[8]; -typedef int8_t lvec8[32]; -typedef uint16_t ulvec16[16]; -typedef uint32_t ulvec32[8]; -typedef uint8_t ulvec8[32]; -#endif - -#if defined(__aarch64__) || defined(__arm__) -// This struct is for ARM color conversion. -struct YuvConstants { - uvec8 kUVCoeff; - vec16 kRGBCoeffBias; -}; -#else -// This struct is for Intel color conversion. -struct YuvConstants { - uint8_t kUVToB[32]; - uint8_t kUVToG[32]; - uint8_t kUVToR[32]; - int16_t kYToRgb[16]; - int16_t kYBiasToRgb[16]; -}; - -// Offsets into YuvConstants structure -#define KUVTOB 0 -#define KUVTOG 32 -#define KUVTOR 64 -#define KYTORGB 96 -#define KYBIASTORGB 128 - -#endif - -#define IS_ALIGNED(p, a) (!((uintptr_t)(p) & ((a)-1))) - -#define align_buffer_64(var, size) \ - uint8_t* var##_mem = (uint8_t*)(malloc((size) + 63)); /* NOLINT */ \ - uint8_t* var = (uint8_t*)(((intptr_t)(var##_mem) + 63) & ~63) /* NOLINT */ - -#define free_aligned_buffer_64(var) \ - free(var##_mem); \ - var = 0 - -#if defined(__APPLE__) || defined(__x86_64__) || defined(__llvm__) -#define OMITFP -#else -#define OMITFP __attribute__((optimize("omit-frame-pointer"))) -#endif - -// NaCL macros for GCC x86 and x64. -#if defined(__native_client__) -#define LABELALIGN ".p2align 5\n" -#else -#define LABELALIGN -#endif - -// Intel Code Analizer markers. Insert IACA_START IACA_END around code to be -// measured and then run with iaca -64 libyuv_unittest. -// IACA_ASM_START amd IACA_ASM_END are equivalents that can be used within -// inline assembly blocks. 
-// example of iaca: -// ~/iaca-lin64/bin/iaca.sh -64 -analysis LATENCY out/Release/libyuv_unittest - -#if defined(__x86_64__) || defined(__i386__) - -#define IACA_ASM_START \ - ".byte 0x0F, 0x0B\n" \ - " movl $111, %%ebx\n" \ - ".byte 0x64, 0x67, 0x90\n" - -#define IACA_ASM_END \ - " movl $222, %%ebx\n" \ - ".byte 0x64, 0x67, 0x90\n" \ - ".byte 0x0F, 0x0B\n" - -#define IACA_SSC_MARK(MARK_ID) \ - __asm__ __volatile__("\n\t movl $" #MARK_ID \ - ", %%ebx" \ - "\n\t .byte 0x64, 0x67, 0x90" \ - : \ - : \ - : "memory"); - -#define IACA_UD_BYTES __asm__ __volatile__("\n\t .byte 0x0F, 0x0B"); - -#else /* Visual C */ -#define IACA_UD_BYTES \ - { __asm _emit 0x0F __asm _emit 0x0B } - -#define IACA_SSC_MARK(x) \ - { __asm mov ebx, x __asm _emit 0x64 __asm _emit 0x67 __asm _emit 0x90 } - -#define IACA_VC64_START __writegsbyte(111, 111); -#define IACA_VC64_END __writegsbyte(222, 222); -#endif - -#define IACA_START \ - { \ - IACA_UD_BYTES \ - IACA_SSC_MARK(111) \ - } -#define IACA_END \ - { \ - IACA_SSC_MARK(222) \ - IACA_UD_BYTES \ - } - -void I444ToARGBRow_NEON(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width); -void I422ToARGBRow_NEON(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width); -void I444AlphaToARGBRow_NEON(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - const uint8_t* src_a, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width); -void I422AlphaToARGBRow_NEON(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - const uint8_t* src_a, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width); -void I422ToRGBARow_NEON(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* dst_rgba, - const struct YuvConstants* yuvconstants, - int width); -void I422ToRGB24Row_NEON(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* dst_rgb24, - const struct YuvConstants* yuvconstants, - int width); -void I422ToRGB565Row_NEON(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* dst_rgb565, - const struct YuvConstants* yuvconstants, - int width); -void I422ToARGB1555Row_NEON(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* dst_argb1555, - const struct YuvConstants* yuvconstants, - int width); -void I422ToARGB4444Row_NEON(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* dst_argb4444, - const struct YuvConstants* yuvconstants, - int width); -void NV12ToARGBRow_NEON(const uint8_t* src_y, - const uint8_t* src_uv, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width); -void NV12ToRGB565Row_NEON(const uint8_t* src_y, - const uint8_t* src_uv, - uint8_t* dst_rgb565, - const struct YuvConstants* yuvconstants, - int width); -void NV21ToARGBRow_NEON(const uint8_t* src_y, - const uint8_t* src_vu, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width); -void NV12ToRGB24Row_NEON(const uint8_t* src_y, - const uint8_t* src_uv, - uint8_t* dst_rgb24, - const struct YuvConstants* yuvconstants, - int width); -void NV21ToRGB24Row_NEON(const uint8_t* src_y, - const uint8_t* src_vu, - uint8_t* dst_rgb24, - const struct YuvConstants* yuvconstants, - int width); -void NV21ToYUV24Row_NEON(const uint8_t* src_y, - const uint8_t* src_vu, - uint8_t* dst_yuv24, - int width); -void 
YUY2ToARGBRow_NEON(const uint8_t* src_yuy2, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width); -void UYVYToARGBRow_NEON(const uint8_t* src_uyvy, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width); -void I444ToARGBRow_MSA(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width); -void I444ToARGBRow_MMI(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width); - -void I422ToARGBRow_MSA(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width); -void I422ToRGBARow_MSA(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width); -void I422ToARGBRow_MMI(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width); -void I422AlphaToARGBRow_MSA(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - const uint8_t* src_a, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width); -void I422ToRGB24Row_MSA(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width); -void I422ToRGB565Row_MSA(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* dst_rgb565, - const struct YuvConstants* yuvconstants, - int width); -void I422ToARGB4444Row_MSA(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* dst_argb4444, - const struct YuvConstants* yuvconstants, - int width); -void I422ToARGB1555Row_MSA(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* dst_argb1555, - const struct YuvConstants* yuvconstants, - int width); -void NV12ToARGBRow_MSA(const uint8_t* src_y, - const uint8_t* src_uv, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width); -void NV12ToRGB565Row_MSA(const uint8_t* src_y, - const uint8_t* src_uv, - uint8_t* dst_rgb565, - const struct YuvConstants* yuvconstants, - int width); -void NV21ToARGBRow_MSA(const uint8_t* src_y, - const uint8_t* src_vu, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width); -void YUY2ToARGBRow_MSA(const uint8_t* src_yuy2, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width); -void UYVYToARGBRow_MSA(const uint8_t* src_uyvy, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width); - -void ARGBToYRow_AVX2(const uint8_t* src_argb, uint8_t* dst_y, int width); -void ARGBToYRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); -void ABGRToYRow_AVX2(const uint8_t* src_abgr, uint8_t* dst_y, int width); -void ABGRToYRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); -void ARGBToYRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_y, int width); -void ARGBToYJRow_AVX2(const uint8_t* src_argb, uint8_t* dst_y, int width); -void ARGBToYJRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); -void ARGBToYJRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_y, int width); -void RGBAToYJRow_AVX2(const uint8_t* src_rgba, uint8_t* dst_y, int width); -void RGBAToYJRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); -void RGBAToYJRow_SSSE3(const uint8_t* src_rgba, 
uint8_t* dst_y, int width); -void BGRAToYRow_SSSE3(const uint8_t* src_bgra, uint8_t* dst_y, int width); -void ABGRToYRow_SSSE3(const uint8_t* src_abgr, uint8_t* dst_y, int width); -void RGBAToYRow_SSSE3(const uint8_t* src_rgba, uint8_t* dst_y, int width); -void RGB24ToYRow_SSSE3(const uint8_t* src_rgb24, uint8_t* dst_y, int width); -void RGB24ToYJRow_SSSE3(const uint8_t* src_rgb24, uint8_t* dst_yj, int width); -void RAWToYRow_SSSE3(const uint8_t* src_raw, uint8_t* dst_y, int width); -void RAWToYJRow_SSSE3(const uint8_t* src_raw, uint8_t* dst_yj, int width); -void RGB24ToYJRow_AVX2(const uint8_t* src_rgb24, uint8_t* dst_yj, int width); -void RAWToYJRow_AVX2(const uint8_t* src_raw, uint8_t* dst_yj, int width); -void ARGBToYRow_NEON(const uint8_t* src_argb, uint8_t* dst_y, int width); -void ARGBToYJRow_NEON(const uint8_t* src_argb, uint8_t* dst_y, int width); -void RGBAToYJRow_NEON(const uint8_t* src_rgba, uint8_t* dst_y, int width); -void ARGBToYRow_MSA(const uint8_t* src_argb0, uint8_t* dst_y, int width); -void ARGBToYJRow_MSA(const uint8_t* src_argb0, uint8_t* dst_y, int width); -void ARGBToYRow_MMI(const uint8_t* src_argb0, uint8_t* dst_y, int width); -void ARGBToYJRow_MMI(const uint8_t* src_argb0, uint8_t* dst_y, int width); -void ARGBToUV444Row_NEON(const uint8_t* src_argb, - uint8_t* dst_u, - uint8_t* dst_v, - int width); -void ARGBToUVRow_NEON(const uint8_t* src_argb, - int src_stride_argb, - uint8_t* dst_u, - uint8_t* dst_v, - int width); -void ARGBToUV444Row_MSA(const uint8_t* src_argb, - uint8_t* dst_u, - uint8_t* dst_v, - int width); -void ARGBToUVRow_MSA(const uint8_t* src_argb, - int src_stride_argb, - uint8_t* dst_u, - uint8_t* dst_v, - int width); -void ARGBToUV444Row_MMI(const uint8_t* src_argb, - uint8_t* dst_u, - uint8_t* dst_v, - int width); -void ARGBToUVRow_MMI(const uint8_t* src_argb, - int src_stride_argb, - uint8_t* dst_u, - uint8_t* dst_v, - int width); -void ARGBToUVJRow_NEON(const uint8_t* src_argb, - int src_stride_argb, - uint8_t* dst_u, - uint8_t* dst_v, - int width); -void BGRAToUVRow_NEON(const uint8_t* src_bgra, - int src_stride_bgra, - uint8_t* dst_u, - uint8_t* dst_v, - int width); -void ABGRToUVRow_NEON(const uint8_t* src_abgr, - int src_stride_abgr, - uint8_t* dst_u, - uint8_t* dst_v, - int width); -void RGBAToUVRow_NEON(const uint8_t* src_rgba, - int src_stride_rgba, - uint8_t* dst_u, - uint8_t* dst_v, - int width); -void RGB24ToUVRow_NEON(const uint8_t* src_rgb24, - int src_stride_rgb24, - uint8_t* dst_u, - uint8_t* dst_v, - int width); -void RAWToUVRow_NEON(const uint8_t* src_raw, - int src_stride_raw, - uint8_t* dst_u, - uint8_t* dst_v, - int width); -void RGB565ToUVRow_NEON(const uint8_t* src_rgb565, - int src_stride_rgb565, - uint8_t* dst_u, - uint8_t* dst_v, - int width); -void ARGB1555ToUVRow_NEON(const uint8_t* src_argb1555, - int src_stride_argb1555, - uint8_t* dst_u, - uint8_t* dst_v, - int width); -void ARGB4444ToUVRow_NEON(const uint8_t* src_argb4444, - int src_stride_argb4444, - uint8_t* dst_u, - uint8_t* dst_v, - int width); -void ARGBToUVJRow_MSA(const uint8_t* src_rgb, - int src_stride_rgb, - uint8_t* dst_u, - uint8_t* dst_v, - int width); -void BGRAToUVRow_MSA(const uint8_t* src_rgb, - int src_stride_rgb, - uint8_t* dst_u, - uint8_t* dst_v, - int width); -void ABGRToUVRow_MSA(const uint8_t* src_rgb, - int src_stride_rgb, - uint8_t* dst_u, - uint8_t* dst_v, - int width); -void RGBAToUVRow_MSA(const uint8_t* src_rgb, - int src_stride_rgb, - uint8_t* dst_u, - uint8_t* dst_v, - int width); -void RGB24ToUVRow_MSA(const uint8_t* src_rgb, - int 
src_stride_rgb, - uint8_t* dst_u, - uint8_t* dst_v, - int width); -void RAWToUVRow_MSA(const uint8_t* src_rgb, - int src_stride_rgb, - uint8_t* dst_u, - uint8_t* dst_v, - int width); -void RGB565ToUVRow_MSA(const uint8_t* src_rgb565, - int src_stride_rgb565, - uint8_t* dst_u, - uint8_t* dst_v, - int width); -void ARGB1555ToUVRow_MSA(const uint8_t* src_argb1555, - int src_stride_argb1555, - uint8_t* dst_u, - uint8_t* dst_v, - int width); -void ARGBToUVJRow_MMI(const uint8_t* src_rgb, - int src_stride_rgb, - uint8_t* dst_u, - uint8_t* dst_v, - int width); -void BGRAToUVRow_MMI(const uint8_t* src_rgb, - int src_stride_rgb, - uint8_t* dst_u, - uint8_t* dst_v, - int width); -void ABGRToUVRow_MMI(const uint8_t* src_rgb, - int src_stride_rgb, - uint8_t* dst_u, - uint8_t* dst_v, - int width); -void RGBAToUVRow_MMI(const uint8_t* src_rgb, - int src_stride_rgb, - uint8_t* dst_u, - uint8_t* dst_v, - int width); -void RGB24ToUVRow_MMI(const uint8_t* src_rgb, - int src_stride_rgb, - uint8_t* dst_u, - uint8_t* dst_v, - int width); -void RAWToUVRow_MMI(const uint8_t* src_rgb, - int src_stride_rgb, - uint8_t* dst_u, - uint8_t* dst_v, - int width); -void RGB565ToUVRow_MMI(const uint8_t* src_rgb565, - int src_stride_rgb565, - uint8_t* dst_u, - uint8_t* dst_v, - int width); -void ARGB1555ToUVRow_MMI(const uint8_t* src_argb1555, - int src_stride_argb1555, - uint8_t* dst_u, - uint8_t* dst_v, - int width); -void ARGB4444ToUVRow_MMI(const uint8_t* src_argb4444, - int src_stride_argb4444, - uint8_t* dst_u, - uint8_t* dst_v, - int width); -void BGRAToYRow_NEON(const uint8_t* src_bgra, uint8_t* dst_y, int width); -void ABGRToYRow_NEON(const uint8_t* src_abgr, uint8_t* dst_y, int width); -void RGBAToYRow_NEON(const uint8_t* src_rgba, uint8_t* dst_y, int width); -void RGB24ToYRow_NEON(const uint8_t* src_rgb24, uint8_t* dst_y, int width); -void RGB24ToYJRow_NEON(const uint8_t* src_rgb24, uint8_t* dst_yj, int width); -void RAWToYRow_NEON(const uint8_t* src_raw, uint8_t* dst_y, int width); -void RAWToYJRow_NEON(const uint8_t* src_raw, uint8_t* dst_yj, int width); -void RGB565ToYRow_NEON(const uint8_t* src_rgb565, uint8_t* dst_y, int width); -void ARGB1555ToYRow_NEON(const uint8_t* src_argb1555, - uint8_t* dst_y, - int width); -void ARGB4444ToYRow_NEON(const uint8_t* src_argb4444, - uint8_t* dst_y, - int width); -void BGRAToYRow_MSA(const uint8_t* src_argb, uint8_t* dst_y, int width); -void ABGRToYRow_MSA(const uint8_t* src_argb, uint8_t* dst_y, int width); -void RGBAToYRow_MSA(const uint8_t* src_argb, uint8_t* dst_y, int width); -void RGB24ToYRow_MSA(const uint8_t* src_argb, uint8_t* dst_y, int width); -void RAWToYRow_MSA(const uint8_t* src_argb, uint8_t* dst_y, int width); -void RGB565ToYRow_MSA(const uint8_t* src_rgb565, uint8_t* dst_y, int width); -void ARGB1555ToYRow_MSA(const uint8_t* src_argb1555, uint8_t* dst_y, int width); -void BGRAToYRow_MMI(const uint8_t* src_argb, uint8_t* dst_y, int width); -void ABGRToYRow_MMI(const uint8_t* src_argb, uint8_t* dst_y, int width); -void RGBAToYRow_MMI(const uint8_t* src_argb, uint8_t* dst_y, int width); -void RGB24ToYRow_MMI(const uint8_t* src_argb, uint8_t* dst_y, int width); -void RAWToYRow_MMI(const uint8_t* src_argb, uint8_t* dst_y, int width); -void RGB565ToYRow_MMI(const uint8_t* src_rgb565, uint8_t* dst_y, int width); -void ARGB1555ToYRow_MMI(const uint8_t* src_argb1555, uint8_t* dst_y, int width); -void ARGB4444ToYRow_MMI(const uint8_t* src_argb4444, uint8_t* dst_y, int width); - -void ARGBToYRow_C(const uint8_t* src_rgb, uint8_t* dst_y, int width); -void 
ARGBToYJRow_C(const uint8_t* src_rgb, uint8_t* dst_y, int width); -void RGBAToYJRow_C(const uint8_t* src_rgb, uint8_t* dst_y, int width); -void BGRAToYRow_C(const uint8_t* src_rgb, uint8_t* dst_y, int width); -void ABGRToYRow_C(const uint8_t* src_rgb, uint8_t* dst_y, int width); -void RGBAToYRow_C(const uint8_t* src_rgb, uint8_t* dst_y, int width); -void RGB24ToYRow_C(const uint8_t* src_rgb, uint8_t* dst_y, int width); -void RGB24ToYJRow_C(const uint8_t* src_rgb, uint8_t* dst_y, int width); -void RAWToYRow_C(const uint8_t* src_rgb, uint8_t* dst_y, int width); -void RAWToYJRow_C(const uint8_t* src_rgb, uint8_t* dst_y, int width); -void RGB565ToYRow_C(const uint8_t* src_rgb565, uint8_t* dst_y, int width); -void ARGB1555ToYRow_C(const uint8_t* src_argb1555, uint8_t* dst_y, int width); -void ARGB4444ToYRow_C(const uint8_t* src_argb4444, uint8_t* dst_y, int width); -void ARGBToYRow_Any_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); -void ARGBToYJRow_Any_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); -void RGBAToYJRow_Any_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); -void BGRAToYRow_Any_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); -void ABGRToYRow_Any_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); -void RGBAToYRow_Any_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); -void RGB24ToYRow_Any_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); -void RGB24ToYJRow_Any_SSSE3(const uint8_t* src_ptr, - uint8_t* dst_ptr, - int width); -void RAWToYRow_Any_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); -void RAWToYJRow_Any_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); -void RGB24ToYJRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); -void RAWToYJRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); -void ARGBToYRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); -void ARGBToYJRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); -void RGBAToYJRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); -void BGRAToYRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); -void ABGRToYRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); -void RGBAToYRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); -void RGB24ToYRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); -void RGB24ToYJRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); -void RAWToYRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); -void RAWToYJRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); -void RGB565ToYRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); -void ARGB1555ToYRow_Any_NEON(const uint8_t* src_ptr, - uint8_t* dst_ptr, - int width); -void ARGB4444ToYRow_Any_NEON(const uint8_t* src_ptr, - uint8_t* dst_ptr, - int width); -void BGRAToYRow_Any_MSA(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); -void ABGRToYRow_Any_MSA(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); -void RGBAToYRow_Any_MSA(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); -void ARGBToYJRow_Any_MSA(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); -void ARGBToYRow_Any_MSA(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); -void RGB24ToYRow_Any_MSA(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); -void RAWToYRow_Any_MSA(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); -void RGB565ToYRow_Any_MSA(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); 
-void ARGB1555ToYRow_Any_MSA(const uint8_t* src_ptr, - uint8_t* dst_ptr, - int width); -void BGRAToYRow_Any_MMI(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); -void ABGRToYRow_Any_MMI(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); -void RGBAToYRow_Any_MMI(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); -void ARGBToYJRow_Any_MMI(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); -void ARGBToYRow_Any_MMI(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); -void RGB24ToYRow_Any_MMI(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); -void RAWToYRow_Any_MMI(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); -void RGB565ToYRow_Any_MMI(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); -void ARGB1555ToYRow_Any_MMI(const uint8_t* src_ptr, - uint8_t* dst_ptr, - int width); -void ARGB4444ToYRow_Any_MMI(const uint8_t* src_ptr, - uint8_t* dst_ptr, - int width); - -void ARGBToUVRow_AVX2(const uint8_t* src_argb, - int src_stride_argb, - uint8_t* dst_u, - uint8_t* dst_v, - int width); -void ABGRToUVRow_AVX2(const uint8_t* src_abgr, - int src_stride_abgr, - uint8_t* dst_u, - uint8_t* dst_v, - int width); -void ARGBToUVJRow_AVX2(const uint8_t* src_argb, - int src_stride_argb, - uint8_t* dst_u, - uint8_t* dst_v, - int width); -void ARGBToUVRow_SSSE3(const uint8_t* src_argb, - int src_stride_argb, - uint8_t* dst_u, - uint8_t* dst_v, - int width); -void ARGBToUVJRow_SSSE3(const uint8_t* src_argb, - int src_stride_argb, - uint8_t* dst_u, - uint8_t* dst_v, - int width); -void BGRAToUVRow_SSSE3(const uint8_t* src_bgra, - int src_stride_bgra, - uint8_t* dst_u, - uint8_t* dst_v, - int width); -void ABGRToUVRow_SSSE3(const uint8_t* src_abgr, - int src_stride_abgr, - uint8_t* dst_u, - uint8_t* dst_v, - int width); -void RGBAToUVRow_SSSE3(const uint8_t* src_rgba, - int src_stride_rgba, - uint8_t* dst_u, - uint8_t* dst_v, - int width); -void ARGBToUVRow_Any_AVX2(const uint8_t* src_ptr, - int src_stride, - uint8_t* dst_u, - uint8_t* dst_v, - int width); -void ABGRToUVRow_Any_AVX2(const uint8_t* src_ptr, - int src_stride, - uint8_t* dst_u, - uint8_t* dst_v, - int width); -void ARGBToUVJRow_Any_AVX2(const uint8_t* src_ptr, - int src_stride, - uint8_t* dst_u, - uint8_t* dst_v, - int width); -void ARGBToUVRow_Any_SSSE3(const uint8_t* src_ptr, - int src_stride, - uint8_t* dst_u, - uint8_t* dst_v, - int width); -void ARGBToUVJRow_Any_SSSE3(const uint8_t* src_ptr, - int src_stride, - uint8_t* dst_u, - uint8_t* dst_v, - int width); -void BGRAToUVRow_Any_SSSE3(const uint8_t* src_ptr, - int src_stride, - uint8_t* dst_u, - uint8_t* dst_v, - int width); -void ABGRToUVRow_Any_SSSE3(const uint8_t* src_ptr, - int src_stride, - uint8_t* dst_u, - uint8_t* dst_v, - int width); -void RGBAToUVRow_Any_SSSE3(const uint8_t* src_ptr, - int src_stride, - uint8_t* dst_u, - uint8_t* dst_v, - int width); -void ARGBToUV444Row_Any_NEON(const uint8_t* src_ptr, - uint8_t* dst_u, - uint8_t* dst_v, - int width); -void ARGBToUVRow_Any_NEON(const uint8_t* src_ptr, - int src_stride, - uint8_t* dst_u, - uint8_t* dst_v, - int width); -void ARGBToUV444Row_Any_MSA(const uint8_t* src_ptr, - uint8_t* dst_u, - uint8_t* dst_v, - int width); -void ARGBToUVRow_Any_MSA(const uint8_t* src_ptr, - int src_stride_ptr, - uint8_t* dst_u, - uint8_t* dst_v, - int width); -void ARGBToUV444Row_Any_MMI(const uint8_t* src_ptr, - uint8_t* dst_u, - uint8_t* dst_v, - int width); -void ARGBToUVRow_Any_MMI(const uint8_t* src_ptr, - int src_stride_ptr, - uint8_t* dst_u, - uint8_t* dst_v, - int width); -void ARGBToUVJRow_Any_NEON(const uint8_t* 
src_ptr, - int src_stride, - uint8_t* dst_u, - uint8_t* dst_v, - int width); -void BGRAToUVRow_Any_NEON(const uint8_t* src_ptr, - int src_stride, - uint8_t* dst_u, - uint8_t* dst_v, - int width); -void ABGRToUVRow_Any_NEON(const uint8_t* src_ptr, - int src_stride, - uint8_t* dst_u, - uint8_t* dst_v, - int width); -void RGBAToUVRow_Any_NEON(const uint8_t* src_ptr, - int src_stride, - uint8_t* dst_u, - uint8_t* dst_v, - int width); -void RGB24ToUVRow_Any_NEON(const uint8_t* src_ptr, - int src_stride, - uint8_t* dst_u, - uint8_t* dst_v, - int width); -void RAWToUVRow_Any_NEON(const uint8_t* src_ptr, - int src_stride, - uint8_t* dst_u, - uint8_t* dst_v, - int width); -void RGB565ToUVRow_Any_NEON(const uint8_t* src_ptr, - int src_stride, - uint8_t* dst_u, - uint8_t* dst_v, - int width); -void ARGB1555ToUVRow_Any_NEON(const uint8_t* src_ptr, - int src_stride, - uint8_t* dst_u, - uint8_t* dst_v, - int width); -void ARGB4444ToUVRow_Any_NEON(const uint8_t* src_ptr, - int src_stride, - uint8_t* dst_u, - uint8_t* dst_v, - int width); -void ARGBToUVJRow_Any_MSA(const uint8_t* src_ptr, - int src_stride_ptr, - uint8_t* dst_u, - uint8_t* dst_v, - int width); -void BGRAToUVRow_Any_MSA(const uint8_t* src_ptr, - int src_stride_ptr, - uint8_t* dst_u, - uint8_t* dst_v, - int width); -void ABGRToUVRow_Any_MSA(const uint8_t* src_ptr, - int src_stride_ptr, - uint8_t* dst_u, - uint8_t* dst_v, - int width); -void RGBAToUVRow_Any_MSA(const uint8_t* src_ptr, - int src_stride_ptr, - uint8_t* dst_u, - uint8_t* dst_v, - int width); -void RGB24ToUVRow_Any_MSA(const uint8_t* src_ptr, - int src_stride_ptr, - uint8_t* dst_u, - uint8_t* dst_v, - int width); -void RAWToUVRow_Any_MSA(const uint8_t* src_ptr, - int src_stride_ptr, - uint8_t* dst_u, - uint8_t* dst_v, - int width); -void RGB565ToUVRow_Any_MSA(const uint8_t* src_ptr, - int src_stride_ptr, - uint8_t* dst_u, - uint8_t* dst_v, - int width); -void ARGB1555ToUVRow_Any_MSA(const uint8_t* src_ptr, - int src_stride_ptr, - uint8_t* dst_u, - uint8_t* dst_v, - int width); -void ARGBToUVJRow_Any_MMI(const uint8_t* src_ptr, - int src_stride_ptr, - uint8_t* dst_u, - uint8_t* dst_v, - int width); -void BGRAToUVRow_Any_MMI(const uint8_t* src_ptr, - int src_stride_ptr, - uint8_t* dst_u, - uint8_t* dst_v, - int width); -void ABGRToUVRow_Any_MMI(const uint8_t* src_ptr, - int src_stride_ptr, - uint8_t* dst_u, - uint8_t* dst_v, - int width); -void RGBAToUVRow_Any_MMI(const uint8_t* src_ptr, - int src_stride_ptr, - uint8_t* dst_u, - uint8_t* dst_v, - int width); -void RGB24ToUVRow_Any_MMI(const uint8_t* src_ptr, - int src_stride_ptr, - uint8_t* dst_u, - uint8_t* dst_v, - int width); -void RAWToUVRow_Any_MMI(const uint8_t* src_ptr, - int src_stride_ptr, - uint8_t* dst_u, - uint8_t* dst_v, - int width); -void RGB565ToUVRow_Any_MMI(const uint8_t* src_ptr, - int src_stride_ptr, - uint8_t* dst_u, - uint8_t* dst_v, - int width); -void ARGB1555ToUVRow_Any_MMI(const uint8_t* src_ptr, - int src_stride_ptr, - uint8_t* dst_u, - uint8_t* dst_v, - int width); -void ARGB4444ToUVRow_Any_MMI(const uint8_t* src_ptr, - int src_stride_ptr, - uint8_t* dst_u, - uint8_t* dst_v, - int width); -void ARGBToUVRow_C(const uint8_t* src_rgb, - int src_stride_rgb, - uint8_t* dst_u, - uint8_t* dst_v, - int width); -void ARGBToUVJRow_C(const uint8_t* src_rgb, - int src_stride_rgb, - uint8_t* dst_u, - uint8_t* dst_v, - int width); -void ARGBToUVRow_C(const uint8_t* src_rgb, - int src_stride_rgb, - uint8_t* dst_u, - uint8_t* dst_v, - int width); -void ARGBToUVJRow_C(const uint8_t* src_rgb, - int src_stride_rgb, - 
uint8_t* dst_u, - uint8_t* dst_v, - int width); -void BGRAToUVRow_C(const uint8_t* src_rgb, - int src_stride_rgb, - uint8_t* dst_u, - uint8_t* dst_v, - int width); -void ABGRToUVRow_C(const uint8_t* src_rgb, - int src_stride_rgb, - uint8_t* dst_u, - uint8_t* dst_v, - int width); -void RGBAToUVRow_C(const uint8_t* src_rgb, - int src_stride_rgb, - uint8_t* dst_u, - uint8_t* dst_v, - int width); -void RGB24ToUVRow_C(const uint8_t* src_rgb, - int src_stride_rgb, - uint8_t* dst_u, - uint8_t* dst_v, - int width); -void RAWToUVRow_C(const uint8_t* src_rgb, - int src_stride_rgb, - uint8_t* dst_u, - uint8_t* dst_v, - int width); -void RGB565ToUVRow_C(const uint8_t* src_rgb565, - int src_stride_rgb565, - uint8_t* dst_u, - uint8_t* dst_v, - int width); -void ARGB1555ToUVRow_C(const uint8_t* src_argb1555, - int src_stride_argb1555, - uint8_t* dst_u, - uint8_t* dst_v, - int width); -void ARGB4444ToUVRow_C(const uint8_t* src_argb4444, - int src_stride_argb4444, - uint8_t* dst_u, - uint8_t* dst_v, - int width); - -void ARGBToUV444Row_SSSE3(const uint8_t* src_argb, - uint8_t* dst_u, - uint8_t* dst_v, - int width); -void ARGBToUV444Row_Any_SSSE3(const uint8_t* src_ptr, - uint8_t* dst_u, - uint8_t* dst_v, - int width); - -void ARGBToUV444Row_C(const uint8_t* src_argb, - uint8_t* dst_u, - uint8_t* dst_v, - int width); - -void MirrorRow_AVX2(const uint8_t* src, uint8_t* dst, int width); -void MirrorRow_SSSE3(const uint8_t* src, uint8_t* dst, int width); -void MirrorRow_NEON(const uint8_t* src, uint8_t* dst, int width); -void MirrorRow_MSA(const uint8_t* src, uint8_t* dst, int width); -void MirrorRow_MMI(const uint8_t* src, uint8_t* dst, int width); -void MirrorRow_C(const uint8_t* src, uint8_t* dst, int width); -void MirrorRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); -void MirrorRow_Any_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); -void MirrorRow_Any_SSE2(const uint8_t* src, uint8_t* dst, int width); -void MirrorRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); -void MirrorRow_Any_MSA(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); -void MirrorRow_Any_MMI(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); -void MirrorUVRow_AVX2(const uint8_t* src_uv, uint8_t* dst_uv, int width); -void MirrorUVRow_SSSE3(const uint8_t* src_uv, uint8_t* dst_uv, int width); -void MirrorUVRow_NEON(const uint8_t* src_uv, uint8_t* dst_uv, int width); -void MirrorUVRow_MSA(const uint8_t* src_uv, uint8_t* dst_uv, int width); -void MirrorUVRow_C(const uint8_t* src_uv, uint8_t* dst_uv, int width); -void MirrorUVRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); -void MirrorUVRow_Any_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); -void MirrorUVRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); -void MirrorUVRow_Any_MSA(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); - -void MirrorSplitUVRow_SSSE3(const uint8_t* src, - uint8_t* dst_u, - uint8_t* dst_v, - int width); -void MirrorSplitUVRow_NEON(const uint8_t* src_uv, - uint8_t* dst_u, - uint8_t* dst_v, - int width); -void MirrorSplitUVRow_MSA(const uint8_t* src_uv, - uint8_t* dst_u, - uint8_t* dst_v, - int width); -void MirrorSplitUVRow_MMI(const uint8_t* src_uv, - uint8_t* dst_u, - uint8_t* dst_v, - int width); -void MirrorSplitUVRow_C(const uint8_t* src_uv, - uint8_t* dst_u, - uint8_t* dst_v, - int width); - -void ARGBMirrorRow_AVX2(const uint8_t* src, uint8_t* dst, int width); -void ARGBMirrorRow_SSE2(const uint8_t* src, uint8_t* dst, int width); -void 
ARGBMirrorRow_NEON(const uint8_t* src_argb, uint8_t* dst_argb, int width); -void ARGBMirrorRow_MSA(const uint8_t* src, uint8_t* dst, int width); -void ARGBMirrorRow_MMI(const uint8_t* src, uint8_t* dst, int width); -void ARGBMirrorRow_C(const uint8_t* src, uint8_t* dst, int width); -void ARGBMirrorRow_Any_AVX2(const uint8_t* src_ptr, - uint8_t* dst_ptr, - int width); -void ARGBMirrorRow_Any_SSE2(const uint8_t* src_ptr, - uint8_t* dst_ptr, - int width); -void ARGBMirrorRow_Any_NEON(const uint8_t* src_ptr, - uint8_t* dst_ptr, - int width); -void ARGBMirrorRow_Any_MSA(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); -void ARGBMirrorRow_Any_MMI(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); - -void RGB24MirrorRow_SSSE3(const uint8_t* src_rgb24, - uint8_t* dst_rgb24, - int width); -void RGB24MirrorRow_NEON(const uint8_t* src_rgb24, - uint8_t* dst_rgb24, - int width); -void RGB24MirrorRow_C(const uint8_t* src_rgb24, uint8_t* dst_rgb24, int width); -void RGB24MirrorRow_Any_SSSE3(const uint8_t* src_ptr, - uint8_t* dst_ptr, - int width); -void RGB24MirrorRow_Any_NEON(const uint8_t* src_ptr, - uint8_t* dst_ptr, - int width); - -void SplitUVRow_C(const uint8_t* src_uv, - uint8_t* dst_u, - uint8_t* dst_v, - int width); -void SplitUVRow_SSE2(const uint8_t* src_uv, - uint8_t* dst_u, - uint8_t* dst_v, - int width); -void SplitUVRow_AVX2(const uint8_t* src_uv, - uint8_t* dst_u, - uint8_t* dst_v, - int width); -void SplitUVRow_NEON(const uint8_t* src_uv, - uint8_t* dst_u, - uint8_t* dst_v, - int width); -void SplitUVRow_MSA(const uint8_t* src_uv, - uint8_t* dst_u, - uint8_t* dst_v, - int width); -void SplitUVRow_MMI(const uint8_t* src_uv, - uint8_t* dst_u, - uint8_t* dst_v, - int width); -void SplitUVRow_Any_SSE2(const uint8_t* src_ptr, - uint8_t* dst_u, - uint8_t* dst_v, - int width); -void SplitUVRow_Any_AVX2(const uint8_t* src_ptr, - uint8_t* dst_u, - uint8_t* dst_v, - int width); -void SplitUVRow_Any_NEON(const uint8_t* src_ptr, - uint8_t* dst_u, - uint8_t* dst_v, - int width); -void SplitUVRow_Any_MSA(const uint8_t* src_ptr, - uint8_t* dst_u, - uint8_t* dst_v, - int width); -void SplitUVRow_Any_MMI(const uint8_t* src_ptr, - uint8_t* dst_u, - uint8_t* dst_v, - int width); - -void MergeUVRow_C(const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* dst_uv, - int width); -void MergeUVRow_SSE2(const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* dst_uv, - int width); -void MergeUVRow_AVX2(const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* dst_uv, - int width); -void MergeUVRow_NEON(const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* dst_uv, - int width); -void MergeUVRow_MSA(const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* dst_uv, - int width); -void MergeUVRow_MMI(const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* dst_uv, - int width); -void MergeUVRow_Any_SSE2(const uint8_t* y_buf, - const uint8_t* uv_buf, - uint8_t* dst_ptr, - int width); -void MergeUVRow_Any_AVX2(const uint8_t* y_buf, - const uint8_t* uv_buf, - uint8_t* dst_ptr, - int width); -void MergeUVRow_Any_NEON(const uint8_t* y_buf, - const uint8_t* uv_buf, - uint8_t* dst_ptr, - int width); -void MergeUVRow_Any_MSA(const uint8_t* y_buf, - const uint8_t* uv_buf, - uint8_t* dst_ptr, - int width); -void MergeUVRow_Any_MMI(const uint8_t* y_buf, - const uint8_t* uv_buf, - uint8_t* dst_ptr, - int width); - -void HalfMergeUVRow_C(const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_uv, - int width); - -void HalfMergeUVRow_NEON(const uint8_t* src_u, - int 
src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_uv, - int width); - -void HalfMergeUVRow_SSSE3(const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_uv, - int width); - -void HalfMergeUVRow_AVX2(const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_uv, - int width); - -void SplitRGBRow_C(const uint8_t* src_rgb, - uint8_t* dst_r, - uint8_t* dst_g, - uint8_t* dst_b, - int width); -void SplitRGBRow_SSSE3(const uint8_t* src_rgb, - uint8_t* dst_r, - uint8_t* dst_g, - uint8_t* dst_b, - int width); -void SplitRGBRow_NEON(const uint8_t* src_rgb, - uint8_t* dst_r, - uint8_t* dst_g, - uint8_t* dst_b, - int width); -void SplitRGBRow_MMI(const uint8_t* src_rgb, - uint8_t* dst_r, - uint8_t* dst_g, - uint8_t* dst_b, - int width); -void SplitRGBRow_Any_SSSE3(const uint8_t* src_ptr, - uint8_t* dst_r, - uint8_t* dst_g, - uint8_t* dst_b, - int width); -void SplitRGBRow_Any_NEON(const uint8_t* src_ptr, - uint8_t* dst_r, - uint8_t* dst_g, - uint8_t* dst_b, - int width); -void SplitRGBRow_Any_MMI(const uint8_t* src_ptr, - uint8_t* dst_r, - uint8_t* dst_g, - uint8_t* dst_b, - int width); - -void MergeRGBRow_C(const uint8_t* src_r, - const uint8_t* src_g, - const uint8_t* src_b, - uint8_t* dst_rgb, - int width); -void MergeRGBRow_SSSE3(const uint8_t* src_r, - const uint8_t* src_g, - const uint8_t* src_b, - uint8_t* dst_rgb, - int width); -void MergeRGBRow_NEON(const uint8_t* src_r, - const uint8_t* src_g, - const uint8_t* src_b, - uint8_t* dst_rgb, - int width); -void MergeRGBRow_MMI(const uint8_t* src_r, - const uint8_t* src_g, - const uint8_t* src_b, - uint8_t* dst_rgb, - int width); -void MergeRGBRow_Any_SSSE3(const uint8_t* y_buf, - const uint8_t* u_buf, - const uint8_t* v_buf, - uint8_t* dst_ptr, - int width); -void MergeRGBRow_Any_NEON(const uint8_t* src_r, - const uint8_t* src_g, - const uint8_t* src_b, - uint8_t* dst_rgb, - int width); -void MergeRGBRow_Any_MMI(const uint8_t* src_r, - const uint8_t* src_g, - const uint8_t* src_b, - uint8_t* dst_rgb, - int width); -void MergeARGBRow_C(const uint8_t* src_r, - const uint8_t* src_g, - const uint8_t* src_b, - const uint8_t* src_a, - uint8_t* dst_argb, - int width); -void MergeARGBRow_SSE2(const uint8_t* src_r, - const uint8_t* src_g, - const uint8_t* src_b, - const uint8_t* src_a, - uint8_t* dst_argb, - int width); -void MergeARGBRow_AVX2(const uint8_t* src_r, - const uint8_t* src_g, - const uint8_t* src_b, - const uint8_t* src_a, - uint8_t* dst_argb, - int width); -void MergeARGBRow_NEON(const uint8_t* src_r, - const uint8_t* src_g, - const uint8_t* src_b, - const uint8_t* src_a, - uint8_t* dst_argb, - int width); -void MergeARGBRow_Any_SSE2(const uint8_t* y_buf, - const uint8_t* u_buf, - const uint8_t* v_buf, - const uint8_t* a_buf, - uint8_t* dst_ptr, - int width); -void MergeARGBRow_Any_AVX2(const uint8_t* y_buf, - const uint8_t* u_buf, - const uint8_t* v_buf, - const uint8_t* a_buf, - uint8_t* dst_ptr, - int width); -void MergeARGBRow_Any_NEON(const uint8_t* y_buf, - const uint8_t* u_buf, - const uint8_t* v_buf, - const uint8_t* a_buf, - uint8_t* dst_ptr, - int width); -void SplitARGBRow_C(const uint8_t* src_argb, - uint8_t* dst_r, - uint8_t* dst_g, - uint8_t* dst_b, - uint8_t* dst_a, - int width); -void SplitARGBRow_SSE2(const uint8_t* src_argb, - uint8_t* dst_r, - uint8_t* dst_g, - uint8_t* dst_b, - uint8_t* dst_a, - int width); -void SplitARGBRow_SSSE3(const uint8_t* src_argb, - uint8_t* dst_r, - uint8_t* dst_g, - uint8_t* dst_b, - 
uint8_t* dst_a, - int width); -void SplitARGBRow_AVX2(const uint8_t* src_argb, - uint8_t* dst_r, - uint8_t* dst_g, - uint8_t* dst_b, - uint8_t* dst_a, - int width); -void SplitARGBRow_NEON(const uint8_t* src_rgba, - uint8_t* dst_r, - uint8_t* dst_g, - uint8_t* dst_b, - uint8_t* dst_a, - int width); -void SplitARGBRow_Any_SSE2(const uint8_t* src_ptr, - uint8_t* dst_r, - uint8_t* dst_g, - uint8_t* dst_b, - uint8_t* dst_a, - int width); -void SplitARGBRow_Any_SSSE3(const uint8_t* src_ptr, - uint8_t* dst_r, - uint8_t* dst_g, - uint8_t* dst_b, - uint8_t* dst_a, - int width); -void SplitARGBRow_Any_AVX2(const uint8_t* src_ptr, - uint8_t* dst_r, - uint8_t* dst_g, - uint8_t* dst_b, - uint8_t* dst_a, - int width); -void SplitARGBRow_Any_NEON(const uint8_t* src_ptr, - uint8_t* dst_r, - uint8_t* dst_g, - uint8_t* dst_b, - uint8_t* dst_a, - int width); -void MergeXRGBRow_C(const uint8_t* src_r, - const uint8_t* src_g, - const uint8_t* src_b, - uint8_t* dst_argb, - int width); -void MergeXRGBRow_SSE2(const uint8_t* src_r, - const uint8_t* src_g, - const uint8_t* src_b, - uint8_t* dst_argb, - int width); -void MergeXRGBRow_AVX2(const uint8_t* src_r, - const uint8_t* src_g, - const uint8_t* src_b, - uint8_t* dst_argb, - int width); -void MergeXRGBRow_NEON(const uint8_t* src_r, - const uint8_t* src_g, - const uint8_t* src_b, - uint8_t* dst_argb, - int width); -void MergeXRGBRow_Any_SSE2(const uint8_t* y_buf, - const uint8_t* u_buf, - const uint8_t* v_buf, - uint8_t* dst_ptr, - int width); -void MergeXRGBRow_Any_AVX2(const uint8_t* y_buf, - const uint8_t* u_buf, - const uint8_t* v_buf, - uint8_t* dst_ptr, - int width); -void MergeXRGBRow_Any_NEON(const uint8_t* y_buf, - const uint8_t* u_buf, - const uint8_t* v_buf, - uint8_t* dst_ptr, - int width); -void SplitXRGBRow_C(const uint8_t* src_argb, - uint8_t* dst_r, - uint8_t* dst_g, - uint8_t* dst_b, - int width); -void SplitXRGBRow_SSE2(const uint8_t* src_argb, - uint8_t* dst_r, - uint8_t* dst_g, - uint8_t* dst_b, - int width); -void SplitXRGBRow_SSSE3(const uint8_t* src_argb, - uint8_t* dst_r, - uint8_t* dst_g, - uint8_t* dst_b, - int width); -void SplitXRGBRow_AVX2(const uint8_t* src_argb, - uint8_t* dst_r, - uint8_t* dst_g, - uint8_t* dst_b, - int width); -void SplitXRGBRow_NEON(const uint8_t* src_rgba, - uint8_t* dst_r, - uint8_t* dst_g, - uint8_t* dst_b, - int width); -void SplitXRGBRow_Any_SSE2(const uint8_t* src_ptr, - uint8_t* dst_r, - uint8_t* dst_g, - uint8_t* dst_b, - int width); -void SplitXRGBRow_Any_SSSE3(const uint8_t* src_ptr, - uint8_t* dst_r, - uint8_t* dst_g, - uint8_t* dst_b, - int width); -void SplitXRGBRow_Any_AVX2(const uint8_t* src_ptr, - uint8_t* dst_r, - uint8_t* dst_g, - uint8_t* dst_b, - int width); -void SplitXRGBRow_Any_NEON(const uint8_t* src_ptr, - uint8_t* dst_r, - uint8_t* dst_g, - uint8_t* dst_b, - int width); - -void MergeXR30Row_C(const uint16_t* src_r, - const uint16_t* src_g, - const uint16_t* src_b, - uint8_t* dst_ar30, - int depth, - int width); -void MergeAR64Row_C(const uint16_t* src_r, - const uint16_t* src_g, - const uint16_t* src_b, - const uint16_t* src_a, - uint16_t* dst_ar64, - int depth, - int width); -void MergeARGB16To8Row_C(const uint16_t* src_r, - const uint16_t* src_g, - const uint16_t* src_b, - const uint16_t* src_a, - uint8_t* dst_argb, - int depth, - int width); -void MergeXR64Row_C(const uint16_t* src_r, - const uint16_t* src_g, - const uint16_t* src_b, - uint16_t* dst_ar64, - int depth, - int width); -void MergeXRGB16To8Row_C(const uint16_t* src_r, - const uint16_t* src_g, - const uint16_t* src_b, - 
uint8_t* dst_argb, - int depth, - int width); -void MergeXR30Row_AVX2(const uint16_t* src_r, - const uint16_t* src_g, - const uint16_t* src_b, - uint8_t* dst_ar30, - int depth, - int width); -void MergeAR64Row_AVX2(const uint16_t* src_r, - const uint16_t* src_g, - const uint16_t* src_b, - const uint16_t* src_a, - uint16_t* dst_ar64, - int depth, - int width); -void MergeARGB16To8Row_AVX2(const uint16_t* src_r, - const uint16_t* src_g, - const uint16_t* src_b, - const uint16_t* src_a, - uint8_t* dst_argb, - int depth, - int width); -void MergeXR64Row_AVX2(const uint16_t* src_r, - const uint16_t* src_g, - const uint16_t* src_b, - uint16_t* dst_ar64, - int depth, - int width); -void MergeXRGB16To8Row_AVX2(const uint16_t* src_r, - const uint16_t* src_g, - const uint16_t* src_b, - uint8_t* dst_argb, - int depth, - int width); -void MergeXR30Row_NEON(const uint16_t* src_r, - const uint16_t* src_g, - const uint16_t* src_b, - uint8_t* dst_ar30, - int depth, - int width); -void MergeXR30Row_10_NEON(const uint16_t* src_r, - const uint16_t* src_g, - const uint16_t* src_b, - uint8_t* dst_ar30, - int /* depth */, - int width); -void MergeAR64Row_NEON(const uint16_t* src_r, - const uint16_t* src_g, - const uint16_t* src_b, - const uint16_t* src_a, - uint16_t* dst_ar64, - int depth, - int width); -void MergeARGB16To8Row_NEON(const uint16_t* src_r, - const uint16_t* src_g, - const uint16_t* src_b, - const uint16_t* src_a, - uint8_t* dst_argb, - int depth, - int width); -void MergeXR64Row_NEON(const uint16_t* src_r, - const uint16_t* src_g, - const uint16_t* src_b, - uint16_t* dst_ar64, - int depth, - int width); -void MergeXRGB16To8Row_NEON(const uint16_t* src_r, - const uint16_t* src_g, - const uint16_t* src_b, - uint8_t* dst_argb, - int depth, - int width); -void MergeXR30Row_Any_AVX2(const uint16_t* r_buf, - const uint16_t* g_buf, - const uint16_t* b_buf, - uint8_t* dst_ptr, - int depth, - int width); -void MergeAR64Row_Any_AVX2(const uint16_t* r_buf, - const uint16_t* g_buf, - const uint16_t* b_buf, - const uint16_t* a_buf, - uint16_t* dst_ptr, - int depth, - int width); -void MergeXR64Row_Any_AVX2(const uint16_t* r_buf, - const uint16_t* g_buf, - const uint16_t* b_buf, - uint16_t* dst_ptr, - int depth, - int width); -void MergeARGB16To8Row_Any_AVX2(const uint16_t* r_buf, - const uint16_t* g_buf, - const uint16_t* b_buf, - const uint16_t* a_buf, - uint8_t* dst_ptr, - int depth, - int width); -void MergeXRGB16To8Row_Any_AVX2(const uint16_t* r_buf, - const uint16_t* g_buf, - const uint16_t* b_buf, - uint8_t* dst_ptr, - int depth, - int width); -void MergeXR30Row_Any_NEON(const uint16_t* r_buf, - const uint16_t* g_buf, - const uint16_t* b_buf, - uint8_t* dst_ptr, - int depth, - int width); -void MergeXR30Row_10_Any_NEON(const uint16_t* r_buf, - const uint16_t* g_buf, - const uint16_t* b_buf, - uint8_t* dst_ptr, - int depth, - int width); -void MergeAR64Row_Any_NEON(const uint16_t* r_buf, - const uint16_t* g_buf, - const uint16_t* b_buf, - const uint16_t* a_buf, - uint16_t* dst_ptr, - int depth, - int width); -void MergeARGB16To8Row_Any_NEON(const uint16_t* r_buf, - const uint16_t* g_buf, - const uint16_t* b_buf, - const uint16_t* a_buf, - uint8_t* dst_ptr, - int depth, - int width); -void MergeXR64Row_Any_NEON(const uint16_t* r_buf, - const uint16_t* g_buf, - const uint16_t* b_buf, - uint16_t* dst_ptr, - int depth, - int width); -void MergeXRGB16To8Row_Any_NEON(const uint16_t* r_buf, - const uint16_t* g_buf, - const uint16_t* b_buf, - uint8_t* dst_ptr, - int depth, - int width); - -void 
MergeUVRow_16_C(const uint16_t* src_u, - const uint16_t* src_v, - uint16_t* dst_uv, - int depth, - int width); -void MergeUVRow_16_AVX2(const uint16_t* src_u, - const uint16_t* src_v, - uint16_t* dst_uv, - int depth, - int width); -void MergeUVRow_16_Any_AVX2(const uint16_t* src_u, - const uint16_t* src_v, - uint16_t* dst_uv, - int depth, - int width); -void MergeUVRow_16_NEON(const uint16_t* src_u, - const uint16_t* src_v, - uint16_t* dst_uv, - int depth, - int width); -void MergeUVRow_16_Any_NEON(const uint16_t* src_u, - const uint16_t* src_v, - uint16_t* dst_uv, - int depth, - int width); - -void SplitUVRow_16_C(const uint16_t* src_uv, - uint16_t* dst_u, - uint16_t* dst_v, - int depth, - int width); -void SplitUVRow_16_AVX2(const uint16_t* src_uv, - uint16_t* dst_u, - uint16_t* dst_v, - int depth, - int width); -void SplitUVRow_16_Any_AVX2(const uint16_t* src_uv, - uint16_t* dst_u, - uint16_t* dst_v, - int depth, - int width); -void SplitUVRow_16_NEON(const uint16_t* src_uv, - uint16_t* dst_u, - uint16_t* dst_v, - int depth, - int width); -void SplitUVRow_16_Any_NEON(const uint16_t* src_uv, - uint16_t* dst_u, - uint16_t* dst_v, - int depth, - int width); - -void MultiplyRow_16_C(const uint16_t* src_y, - uint16_t* dst_y, - int scale, - int width); -void MultiplyRow_16_AVX2(const uint16_t* src_y, - uint16_t* dst_y, - int scale, - int width); -void MultiplyRow_16_Any_AVX2(const uint16_t* src_ptr, - uint16_t* dst_ptr, - int scale, - int width); -void MultiplyRow_16_NEON(const uint16_t* src_y, - uint16_t* dst_y, - int scale, - int width); -void MultiplyRow_16_Any_NEON(const uint16_t* src_ptr, - uint16_t* dst_ptr, - int scale, - int width); - -void DivideRow_16_C(const uint16_t* src_y, - uint16_t* dst_y, - int scale, - int width); -void DivideRow_16_AVX2(const uint16_t* src_y, - uint16_t* dst_y, - int scale, - int width); -void DivideRow_16_Any_AVX2(const uint16_t* src_ptr, - uint16_t* dst_ptr, - int scale, - int width); -void DivideRow_16_NEON(const uint16_t* src_y, - uint16_t* dst_y, - int scale, - int width); -void DivideRow_16_Any_NEON(const uint16_t* src_ptr, - uint16_t* dst_ptr, - int scale, - int width); - -void Convert8To16Row_C(const uint8_t* src_y, - uint16_t* dst_y, - int scale, - int width); -void Convert8To16Row_SSE2(const uint8_t* src_y, - uint16_t* dst_y, - int scale, - int width); -void Convert8To16Row_AVX2(const uint8_t* src_y, - uint16_t* dst_y, - int scale, - int width); -void Convert8To16Row_Any_SSE2(const uint8_t* src_ptr, - uint16_t* dst_ptr, - int scale, - int width); -void Convert8To16Row_Any_AVX2(const uint8_t* src_ptr, - uint16_t* dst_ptr, - int scale, - int width); - -void Convert16To8Row_C(const uint16_t* src_y, - uint8_t* dst_y, - int scale, - int width); -void Convert16To8Row_SSSE3(const uint16_t* src_y, - uint8_t* dst_y, - int scale, - int width); -void Convert16To8Row_AVX2(const uint16_t* src_y, - uint8_t* dst_y, - int scale, - int width); -void Convert16To8Row_Any_SSSE3(const uint16_t* src_ptr, - uint8_t* dst_ptr, - int scale, - int width); -void Convert16To8Row_Any_AVX2(const uint16_t* src_ptr, - uint8_t* dst_ptr, - int scale, - int width); - -void CopyRow_SSE2(const uint8_t* src, uint8_t* dst, int width); -void CopyRow_AVX(const uint8_t* src, uint8_t* dst, int width); -void CopyRow_ERMS(const uint8_t* src, uint8_t* dst, int width); -void CopyRow_NEON(const uint8_t* src, uint8_t* dst, int width); -void CopyRow_MIPS(const uint8_t* src, uint8_t* dst, int count); -void CopyRow_C(const uint8_t* src, uint8_t* dst, int count); -void CopyRow_Any_SSE2(const uint8_t* 
src_ptr, uint8_t* dst_ptr, int width); -void CopyRow_Any_AVX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); -void CopyRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); - -void CopyRow_16_C(const uint16_t* src, uint16_t* dst, int count); - -void ARGBCopyAlphaRow_C(const uint8_t* src, uint8_t* dst, int width); -void ARGBCopyAlphaRow_SSE2(const uint8_t* src, uint8_t* dst, int width); -void ARGBCopyAlphaRow_AVX2(const uint8_t* src, uint8_t* dst, int width); -void ARGBCopyAlphaRow_MMI(const uint8_t* src, uint8_t* dst, int width); -void ARGBCopyAlphaRow_Any_SSE2(const uint8_t* src_ptr, - uint8_t* dst_ptr, - int width); -void ARGBCopyAlphaRow_Any_AVX2(const uint8_t* src_ptr, - uint8_t* dst_ptr, - int width); -void ARGBCopyAlphaRow_Any_MMI(const uint8_t* src_ptr, - uint8_t* dst_ptr, - int width); - -void ARGBExtractAlphaRow_C(const uint8_t* src_argb, uint8_t* dst_a, int width); -void ARGBExtractAlphaRow_SSE2(const uint8_t* src_argb, - uint8_t* dst_a, - int width); -void ARGBExtractAlphaRow_AVX2(const uint8_t* src_argb, - uint8_t* dst_a, - int width); -void ARGBExtractAlphaRow_NEON(const uint8_t* src_argb, - uint8_t* dst_a, - int width); -void ARGBExtractAlphaRow_MSA(const uint8_t* src_argb, - uint8_t* dst_a, - int width); -void ARGBExtractAlphaRow_MMI(const uint8_t* src_argb, - uint8_t* dst_a, - int width); -void ARGBExtractAlphaRow_Any_SSE2(const uint8_t* src_ptr, - uint8_t* dst_ptr, - int width); -void ARGBExtractAlphaRow_Any_AVX2(const uint8_t* src_ptr, - uint8_t* dst_ptr, - int width); -void ARGBExtractAlphaRow_Any_NEON(const uint8_t* src_ptr, - uint8_t* dst_ptr, - int width); -void ARGBExtractAlphaRow_Any_MSA(const uint8_t* src_ptr, - uint8_t* dst_ptr, - int width); -void ARGBExtractAlphaRow_Any_MMI(const uint8_t* src_ptr, - uint8_t* dst_ptr, - int width); - -void ARGBCopyYToAlphaRow_C(const uint8_t* src, uint8_t* dst, int width); -void ARGBCopyYToAlphaRow_SSE2(const uint8_t* src, uint8_t* dst, int width); -void ARGBCopyYToAlphaRow_AVX2(const uint8_t* src, uint8_t* dst, int width); -void ARGBCopyYToAlphaRow_MMI(const uint8_t* src, uint8_t* dst, int width); -void ARGBCopyYToAlphaRow_Any_SSE2(const uint8_t* src_ptr, - uint8_t* dst_ptr, - int width); -void ARGBCopyYToAlphaRow_Any_AVX2(const uint8_t* src_ptr, - uint8_t* dst_ptr, - int width); -void ARGBCopyYToAlphaRow_Any_MMI(const uint8_t* src_ptr, - uint8_t* dst_ptr, - int width); - -void SetRow_C(uint8_t* dst, uint8_t v8, int width); -void SetRow_MSA(uint8_t* dst, uint8_t v8, int width); -void SetRow_X86(uint8_t* dst, uint8_t v8, int width); -void SetRow_ERMS(uint8_t* dst, uint8_t v8, int width); -void SetRow_NEON(uint8_t* dst, uint8_t v8, int width); -void SetRow_Any_X86(uint8_t* dst_ptr, uint8_t v32, int width); -void SetRow_Any_NEON(uint8_t* dst_ptr, uint8_t v32, int width); - -void ARGBSetRow_C(uint8_t* dst_argb, uint32_t v32, int width); -void ARGBSetRow_X86(uint8_t* dst_argb, uint32_t v32, int width); -void ARGBSetRow_NEON(uint8_t* dst, uint32_t v32, int width); -void ARGBSetRow_Any_NEON(uint8_t* dst_ptr, uint32_t v32, int width); -void ARGBSetRow_MSA(uint8_t* dst_argb, uint32_t v32, int width); -void ARGBSetRow_Any_MSA(uint8_t* dst_ptr, uint32_t v32, int width); -void ARGBSetRow_MMI(uint8_t* dst_argb, uint32_t v32, int width); -void ARGBSetRow_Any_MMI(uint8_t* dst_ptr, uint32_t v32, int width); - -// ARGBShufflers for BGRAToARGB etc. 
-void ARGBShuffleRow_C(const uint8_t* src_argb, - uint8_t* dst_argb, - const uint8_t* shuffler, - int width); -void ARGBShuffleRow_SSSE3(const uint8_t* src_argb, - uint8_t* dst_argb, - const uint8_t* shuffler, - int width); -void ARGBShuffleRow_AVX2(const uint8_t* src_argb, - uint8_t* dst_argb, - const uint8_t* shuffler, - int width); -void ARGBShuffleRow_NEON(const uint8_t* src_argb, - uint8_t* dst_argb, - const uint8_t* shuffler, - int width); -void ARGBShuffleRow_MSA(const uint8_t* src_argb, - uint8_t* dst_argb, - const uint8_t* shuffler, - int width); -void ARGBShuffleRow_MMI(const uint8_t* src_argb, - uint8_t* dst_argb, - const uint8_t* shuffler, - int width); -void ARGBShuffleRow_Any_SSSE3(const uint8_t* src_ptr, - uint8_t* dst_ptr, - const uint8_t* param, - int width); -void ARGBShuffleRow_Any_AVX2(const uint8_t* src_ptr, - uint8_t* dst_ptr, - const uint8_t* param, - int width); -void ARGBShuffleRow_Any_NEON(const uint8_t* src_ptr, - uint8_t* dst_ptr, - const uint8_t* param, - int width); -void ARGBShuffleRow_Any_MSA(const uint8_t* src_ptr, - uint8_t* dst_ptr, - const uint8_t* param, - int width); -void ARGBShuffleRow_Any_MMI(const uint8_t* src_ptr, - uint8_t* dst_ptr, - const uint8_t* param, - int width); - -void RGB24ToARGBRow_SSSE3(const uint8_t* src_rgb24, - uint8_t* dst_argb, - int width); -void RAWToARGBRow_SSSE3(const uint8_t* src_raw, uint8_t* dst_argb, int width); -void RAWToRGBARow_SSSE3(const uint8_t* src_raw, uint8_t* dst_rgba, int width); -void RAWToRGB24Row_SSSE3(const uint8_t* src_raw, uint8_t* dst_rgb24, int width); -void RGB565ToARGBRow_SSE2(const uint8_t* src, uint8_t* dst, int width); -void ARGB1555ToARGBRow_SSE2(const uint8_t* src, uint8_t* dst, int width); -void ARGB4444ToARGBRow_SSE2(const uint8_t* src, uint8_t* dst, int width); -void RGB565ToARGBRow_AVX2(const uint8_t* src_rgb565, - uint8_t* dst_argb, - int width); -void ARGB1555ToARGBRow_AVX2(const uint8_t* src_argb1555, - uint8_t* dst_argb, - int width); -void ARGB4444ToARGBRow_AVX2(const uint8_t* src_argb4444, - uint8_t* dst_argb, - int width); - -void RGB24ToARGBRow_NEON(const uint8_t* src_rgb24, - uint8_t* dst_argb, - int width); -void RGB24ToARGBRow_MSA(const uint8_t* src_rgb24, uint8_t* dst_argb, int width); -void RGB24ToARGBRow_MMI(const uint8_t* src_rgb24, uint8_t* dst_argb, int width); -void RAWToARGBRow_NEON(const uint8_t* src_raw, uint8_t* dst_argb, int width); -void RAWToRGBARow_NEON(const uint8_t* src_raw, uint8_t* dst_rgba, int width); -void RAWToARGBRow_MSA(const uint8_t* src_raw, uint8_t* dst_argb, int width); -void RAWToARGBRow_MMI(const uint8_t* src_raw, uint8_t* dst_argb, int width); -void RAWToRGB24Row_NEON(const uint8_t* src_raw, uint8_t* dst_rgb24, int width); -void RAWToRGB24Row_MSA(const uint8_t* src_raw, uint8_t* dst_rgb24, int width); -void RAWToRGB24Row_MMI(const uint8_t* src_raw, uint8_t* dst_rgb24, int width); -void RGB565ToARGBRow_NEON(const uint8_t* src_rgb565, - uint8_t* dst_argb, - int width); -void RGB565ToARGBRow_MSA(const uint8_t* src_rgb565, - uint8_t* dst_argb, - int width); -void RGB565ToARGBRow_MMI(const uint8_t* src_rgb565, - uint8_t* dst_argb, - int width); -void ARGB1555ToARGBRow_NEON(const uint8_t* src_argb1555, - uint8_t* dst_argb, - int width); -void ARGB1555ToARGBRow_MSA(const uint8_t* src_argb1555, - uint8_t* dst_argb, - int width); -void ARGB1555ToARGBRow_MMI(const uint8_t* src_argb1555, - uint8_t* dst_argb, - int width); -void ARGB4444ToARGBRow_NEON(const uint8_t* src_argb4444, - uint8_t* dst_argb, - int width); -void ARGB4444ToARGBRow_MSA(const uint8_t* 
src_argb4444, - uint8_t* dst_argb, - int width); -void ARGB4444ToARGBRow_MMI(const uint8_t* src_argb4444, - uint8_t* dst_argb, - int width); -void RGB24ToARGBRow_C(const uint8_t* src_rgb24, uint8_t* dst_argb, int width); -void RAWToARGBRow_C(const uint8_t* src_raw, uint8_t* dst_argb, int width); -void RAWToRGBARow_C(const uint8_t* src_raw, uint8_t* dst_rgba, int width); -void RAWToRGB24Row_C(const uint8_t* src_raw, uint8_t* dst_rgb24, int width); -void RGB565ToARGBRow_C(const uint8_t* src_rgb565, uint8_t* dst_argb, int width); -void ARGB1555ToARGBRow_C(const uint8_t* src_argb1555, - uint8_t* dst_argb, - int width); -void ARGB4444ToARGBRow_C(const uint8_t* src_argb4444, - uint8_t* dst_argb, - int width); -void AR30ToARGBRow_C(const uint8_t* src_ar30, uint8_t* dst_argb, int width); -void AR30ToABGRRow_C(const uint8_t* src_ar30, uint8_t* dst_abgr, int width); -void ARGBToAR30Row_C(const uint8_t* src_argb, uint8_t* dst_ar30, int width); -void AR30ToAB30Row_C(const uint8_t* src_ar30, uint8_t* dst_ab30, int width); - -void RGB24ToARGBRow_Any_SSSE3(const uint8_t* src_ptr, - uint8_t* dst_ptr, - int width); -void RAWToARGBRow_Any_SSSE3(const uint8_t* src_ptr, - uint8_t* dst_ptr, - int width); -void RAWToRGBARow_Any_SSSE3(const uint8_t* src_ptr, - uint8_t* dst_ptr, - int width); -void RAWToRGB24Row_Any_SSSE3(const uint8_t* src_ptr, - uint8_t* dst_ptr, - int width); - -void RGB565ToARGBRow_Any_SSE2(const uint8_t* src_ptr, - uint8_t* dst_ptr, - int width); -void ARGB1555ToARGBRow_Any_SSE2(const uint8_t* src_ptr, - uint8_t* dst_ptr, - int width); -void ARGB4444ToARGBRow_Any_SSE2(const uint8_t* src_ptr, - uint8_t* dst_ptr, - int width); -void RGB565ToARGBRow_Any_AVX2(const uint8_t* src_ptr, - uint8_t* dst_ptr, - int width); -void ARGB1555ToARGBRow_Any_AVX2(const uint8_t* src_ptr, - uint8_t* dst_ptr, - int width); -void ARGB4444ToARGBRow_Any_AVX2(const uint8_t* src_ptr, - uint8_t* dst_ptr, - int width); - -void RGB24ToARGBRow_Any_NEON(const uint8_t* src_ptr, - uint8_t* dst_ptr, - int width); -void RGB24ToARGBRow_Any_MSA(const uint8_t* src_ptr, - uint8_t* dst_ptr, - int width); -void RGB24ToARGBRow_Any_MMI(const uint8_t* src_ptr, - uint8_t* dst_ptr, - int width); -void RAWToARGBRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); -void RAWToRGBARow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); -void RAWToARGBRow_Any_MSA(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); -void RAWToARGBRow_Any_MMI(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); -void RAWToRGB24Row_Any_NEON(const uint8_t* src_ptr, - uint8_t* dst_ptr, - int width); -void RAWToRGB24Row_Any_MSA(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); -void RAWToRGB24Row_Any_MMI(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); -void RGB565ToARGBRow_Any_NEON(const uint8_t* src_ptr, - uint8_t* dst_ptr, - int width); -void RGB565ToARGBRow_Any_MSA(const uint8_t* src_ptr, - uint8_t* dst_ptr, - int width); -void RGB565ToARGBRow_Any_MMI(const uint8_t* src_ptr, - uint8_t* dst_ptr, - int width); -void ARGB1555ToARGBRow_Any_NEON(const uint8_t* src_ptr, - uint8_t* dst_ptr, - int width); -void ARGB1555ToARGBRow_Any_MSA(const uint8_t* src_ptr, - uint8_t* dst_ptr, - int width); -void ARGB1555ToARGBRow_Any_MMI(const uint8_t* src_ptr, - uint8_t* dst_ptr, - int width); -void ARGB4444ToARGBRow_Any_NEON(const uint8_t* src_ptr, - uint8_t* dst_ptr, - int width); - -void ARGB4444ToARGBRow_Any_MSA(const uint8_t* src_ptr, - uint8_t* dst_ptr, - int width); -void ARGB4444ToARGBRow_Any_MMI(const uint8_t* src_ptr, - uint8_t* 
dst_ptr, - int width); - -void ARGBToRGB24Row_SSSE3(const uint8_t* src, uint8_t* dst, int width); -void ARGBToRAWRow_SSSE3(const uint8_t* src, uint8_t* dst, int width); -void ARGBToRGB565Row_SSE2(const uint8_t* src, uint8_t* dst, int width); -void ARGBToARGB1555Row_SSE2(const uint8_t* src, uint8_t* dst, int width); -void ARGBToARGB4444Row_SSE2(const uint8_t* src, uint8_t* dst, int width); -void ABGRToAR30Row_SSSE3(const uint8_t* src, uint8_t* dst, int width); -void ARGBToAR30Row_SSSE3(const uint8_t* src, uint8_t* dst, int width); - -void ARGBToRAWRow_AVX2(const uint8_t* src, uint8_t* dst, int width); -void ARGBToRGB24Row_AVX2(const uint8_t* src, uint8_t* dst, int width); - -void ARGBToRGB24Row_AVX512VBMI(const uint8_t* src, uint8_t* dst, int width); - -void ARGBToRGB565DitherRow_C(const uint8_t* src_argb, - uint8_t* dst_rgb, - const uint32_t dither4, - int width); -void ARGBToRGB565DitherRow_SSE2(const uint8_t* src, - uint8_t* dst, - const uint32_t dither4, - int width); -void ARGBToRGB565DitherRow_AVX2(const uint8_t* src, - uint8_t* dst, - const uint32_t dither4, - int width); - -void ARGBToRGB565Row_AVX2(const uint8_t* src_argb, uint8_t* dst_rgb, int width); -void ARGBToARGB1555Row_AVX2(const uint8_t* src_argb, - uint8_t* dst_rgb, - int width); -void ARGBToARGB4444Row_AVX2(const uint8_t* src_argb, - uint8_t* dst_rgb, - int width); -void ABGRToAR30Row_AVX2(const uint8_t* src, uint8_t* dst, int width); -void ARGBToAR30Row_AVX2(const uint8_t* src, uint8_t* dst, int width); - -void ARGBToRGB24Row_NEON(const uint8_t* src_argb, - uint8_t* dst_rgb24, - int width); -void ARGBToRAWRow_NEON(const uint8_t* src_argb, uint8_t* dst_raw, int width); -void ARGBToRGB565Row_NEON(const uint8_t* src_argb, - uint8_t* dst_rgb565, - int width); -void ARGBToARGB1555Row_NEON(const uint8_t* src_argb, - uint8_t* dst_argb1555, - int width); -void ARGBToARGB4444Row_NEON(const uint8_t* src_argb, - uint8_t* dst_argb4444, - int width); -void ARGBToRGB565DitherRow_NEON(const uint8_t* src_argb, - uint8_t* dst_rgb, - const uint32_t dither4, - int width); -void ARGBToRGB24Row_MSA(const uint8_t* src_argb, uint8_t* dst_rgb, int width); -void ARGBToRAWRow_MSA(const uint8_t* src_argb, uint8_t* dst_rgb, int width); -void ARGBToRGB565Row_MSA(const uint8_t* src_argb, uint8_t* dst_rgb, int width); -void ARGBToARGB1555Row_MSA(const uint8_t* src_argb, - uint8_t* dst_rgb, - int width); -void ARGBToARGB4444Row_MSA(const uint8_t* src_argb, - uint8_t* dst_rgb, - int width); -void ARGBToRGB565DitherRow_MSA(const uint8_t* src_argb, - uint8_t* dst_rgb, - const uint32_t dither4, - int width); - -void ARGBToRGB24Row_MMI(const uint8_t* src_argb, uint8_t* dst_rgb, int width); -void ARGBToRAWRow_MMI(const uint8_t* src_argb, uint8_t* dst_rgb, int width); -void ARGBToRGB565Row_MMI(const uint8_t* src_argb, uint8_t* dst_rgb, int width); -void ARGBToARGB1555Row_MMI(const uint8_t* src_argb, - uint8_t* dst_rgb, - int width); -void ARGBToARGB4444Row_MMI(const uint8_t* src_argb, - uint8_t* dst_rgb, - int width); -void ARGBToRGB565DitherRow_MMI(const uint8_t* src_argb, - uint8_t* dst_rgb, - const uint32_t dither4, - int width); - -void ARGBToRGBARow_C(const uint8_t* src_argb, uint8_t* dst_rgb, int width); -void ARGBToRGB24Row_C(const uint8_t* src_argb, uint8_t* dst_rgb, int width); -void ARGBToRAWRow_C(const uint8_t* src_argb, uint8_t* dst_rgb, int width); -void ARGBToRGB565Row_C(const uint8_t* src_argb, uint8_t* dst_rgb, int width); -void ARGBToARGB1555Row_C(const uint8_t* src_argb, uint8_t* dst_rgb, int width); -void ARGBToARGB4444Row_C(const uint8_t* 
src_argb, uint8_t* dst_rgb, int width); -void ABGRToAR30Row_C(const uint8_t* src_abgr, uint8_t* dst_ar30, int width); -void ARGBToAR30Row_C(const uint8_t* src_argb, uint8_t* dst_ar30, int width); - -void ARGBToAR64Row_C(const uint8_t* src_argb, uint16_t* dst_ar64, int width); -void ARGBToAB64Row_C(const uint8_t* src_argb, uint16_t* dst_ab64, int width); -void AR64ToARGBRow_C(const uint16_t* src_ar64, uint8_t* dst_argb, int width); -void AB64ToARGBRow_C(const uint16_t* src_ab64, uint8_t* dst_argb, int width); -void AR64ShuffleRow_C(const uint8_t* src_ar64, - uint8_t* dst_ar64, - const uint8_t* shuffler, - int width); -void ARGBToAR64Row_SSSE3(const uint8_t* src_argb, - uint16_t* dst_ar64, - int width); -void ARGBToAB64Row_SSSE3(const uint8_t* src_argb, - uint16_t* dst_ab64, - int width); -void AR64ToARGBRow_SSSE3(const uint16_t* src_ar64, - uint8_t* dst_argb, - int width); -void AB64ToARGBRow_SSSE3(const uint16_t* src_ab64, - uint8_t* dst_argb, - int width); -void ARGBToAR64Row_AVX2(const uint8_t* src_argb, uint16_t* dst_ar64, int width); -void ARGBToAB64Row_AVX2(const uint8_t* src_argb, uint16_t* dst_ab64, int width); -void AR64ToARGBRow_AVX2(const uint16_t* src_ar64, uint8_t* dst_argb, int width); -void AB64ToARGBRow_AVX2(const uint16_t* src_ab64, uint8_t* dst_argb, int width); -void ARGBToAR64Row_NEON(const uint8_t* src_argb, uint16_t* dst_ar64, int width); -void ARGBToAB64Row_NEON(const uint8_t* src_argb, uint16_t* dst_ab64, int width); -void AR64ToARGBRow_NEON(const uint16_t* src_ar64, uint8_t* dst_argb, int width); -void AB64ToARGBRow_NEON(const uint16_t* src_ab64, uint8_t* dst_argb, int width); -void ARGBToAR64Row_Any_SSSE3(const uint8_t* src_ptr, - uint16_t* dst_ptr, - int width); -void ARGBToAB64Row_Any_SSSE3(const uint8_t* src_ptr, - uint16_t* dst_ptr, - int width); -void AR64ToARGBRow_Any_SSSE3(const uint16_t* src_ptr, - uint8_t* dst_ptr, - int width); -void AB64ToARGBRow_Any_SSSE3(const uint16_t* src_ptr, - uint8_t* dst_ptr, - int width); -void ARGBToAR64Row_Any_AVX2(const uint8_t* src_ptr, - uint16_t* dst_ptr, - int width); -void ARGBToAB64Row_Any_AVX2(const uint8_t* src_ptr, - uint16_t* dst_ptr, - int width); -void AR64ToARGBRow_Any_AVX2(const uint16_t* src_ptr, - uint8_t* dst_ptr, - int width); -void AB64ToARGBRow_Any_AVX2(const uint16_t* src_ptr, - uint8_t* dst_ptr, - int width); -void ARGBToAR64Row_Any_NEON(const uint8_t* src_ptr, - uint16_t* dst_ptr, - int width); -void ARGBToAB64Row_Any_NEON(const uint8_t* src_ptr, - uint16_t* dst_ptr, - int width); -void AR64ToARGBRow_Any_NEON(const uint16_t* src_ptr, - uint8_t* dst_ptr, - int width); -void AB64ToARGBRow_Any_NEON(const uint16_t* src_ptr, - uint8_t* dst_ptr, - int width); - -void J400ToARGBRow_SSE2(const uint8_t* src_y, uint8_t* dst_argb, int width); -void J400ToARGBRow_AVX2(const uint8_t* src_y, uint8_t* dst_argb, int width); -void J400ToARGBRow_NEON(const uint8_t* src_y, uint8_t* dst_argb, int width); -void J400ToARGBRow_MSA(const uint8_t* src_y, uint8_t* dst_argb, int width); -void J400ToARGBRow_MMI(const uint8_t* src_y, uint8_t* dst_argb, int width); -void J400ToARGBRow_C(const uint8_t* src_y, uint8_t* dst_argb, int width); -void J400ToARGBRow_Any_SSE2(const uint8_t* src_ptr, - uint8_t* dst_ptr, - int width); -void J400ToARGBRow_Any_AVX2(const uint8_t* src_ptr, - uint8_t* dst_ptr, - int width); -void J400ToARGBRow_Any_NEON(const uint8_t* src_ptr, - uint8_t* dst_ptr, - int width); -void J400ToARGBRow_Any_MSA(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); -void J400ToARGBRow_Any_MMI(const uint8_t* src_ptr, 
uint8_t* dst_ptr, int width); - -void I444ToARGBRow_C(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* rgb_buf, - const struct YuvConstants* yuvconstants, - int width); -void I422ToARGBRow_C(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* rgb_buf, - const struct YuvConstants* yuvconstants, - int width); -void I422ToAR30Row_C(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* rgb_buf, - const struct YuvConstants* yuvconstants, - int width); -void I210ToAR30Row_C(const uint16_t* src_y, - const uint16_t* src_u, - const uint16_t* src_v, - uint8_t* rgb_buf, - const struct YuvConstants* yuvconstants, - int width); -void I210ToARGBRow_C(const uint16_t* src_y, - const uint16_t* src_u, - const uint16_t* src_v, - uint8_t* rgb_buf, - const struct YuvConstants* yuvconstants, - int width); -void I212ToAR30Row_C(const uint16_t* src_y, - const uint16_t* src_u, - const uint16_t* src_v, - uint8_t* rgb_buf, - const struct YuvConstants* yuvconstants, - int width); -void I212ToARGBRow_C(const uint16_t* src_y, - const uint16_t* src_u, - const uint16_t* src_v, - uint8_t* rgb_buf, - const struct YuvConstants* yuvconstants, - int width); -void I410ToAR30Row_C(const uint16_t* src_y, - const uint16_t* src_u, - const uint16_t* src_v, - uint8_t* rgb_buf, - const struct YuvConstants* yuvconstants, - int width); -void I410ToARGBRow_C(const uint16_t* src_y, - const uint16_t* src_u, - const uint16_t* src_v, - uint8_t* rgb_buf, - const struct YuvConstants* yuvconstants, - int width); -void I210AlphaToARGBRow_C(const uint16_t* src_y, - const uint16_t* src_u, - const uint16_t* src_v, - const uint16_t* src_a, - uint8_t* rgb_buf, - const struct YuvConstants* yuvconstants, - int width); -void I410AlphaToARGBRow_C(const uint16_t* src_y, - const uint16_t* src_u, - const uint16_t* src_v, - const uint16_t* src_a, - uint8_t* rgb_buf, - const struct YuvConstants* yuvconstants, - int width); -void I444AlphaToARGBRow_C(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - const uint8_t* src_a, - uint8_t* rgb_buf, - const struct YuvConstants* yuvconstants, - int width); -void I422AlphaToARGBRow_C(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - const uint8_t* src_a, - uint8_t* rgb_buf, - const struct YuvConstants* yuvconstants, - int width); -void NV12ToARGBRow_C(const uint8_t* src_y, - const uint8_t* src_uv, - uint8_t* rgb_buf, - const struct YuvConstants* yuvconstants, - int width); -void NV12ToRGB565Row_C(const uint8_t* src_y, - const uint8_t* src_uv, - uint8_t* dst_rgb565, - const struct YuvConstants* yuvconstants, - int width); -void NV21ToARGBRow_C(const uint8_t* src_y, - const uint8_t* src_vu, - uint8_t* rgb_buf, - const struct YuvConstants* yuvconstants, - int width); -void NV12ToRGB24Row_C(const uint8_t* src_y, - const uint8_t* src_uv, - uint8_t* rgb_buf, - const struct YuvConstants* yuvconstants, - int width); -void NV21ToRGB24Row_C(const uint8_t* src_y, - const uint8_t* src_vu, - uint8_t* rgb_buf, - const struct YuvConstants* yuvconstants, - int width); -void NV21ToYUV24Row_C(const uint8_t* src_y, - const uint8_t* src_vu, - uint8_t* dst_yuv24, - int width); -void YUY2ToARGBRow_C(const uint8_t* src_yuy2, - uint8_t* rgb_buf, - const struct YuvConstants* yuvconstants, - int width); -void UYVYToARGBRow_C(const uint8_t* src_uyvy, - uint8_t* rgb_buf, - const struct YuvConstants* yuvconstants, - int width); -void P210ToARGBRow_C(const uint16_t* src_y, - const uint16_t* src_uv, - uint8_t* dst_argb, 
- const struct YuvConstants* yuvconstants, - int width); -void P410ToARGBRow_C(const uint16_t* src_y, - const uint16_t* src_uv, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width); -void P210ToAR30Row_C(const uint16_t* src_y, - const uint16_t* src_uv, - uint8_t* dst_ar30, - const struct YuvConstants* yuvconstants, - int width); -void P410ToAR30Row_C(const uint16_t* src_y, - const uint16_t* src_uv, - uint8_t* dst_ar30, - const struct YuvConstants* yuvconstants, - int width); - -void I422ToRGBARow_C(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* rgb_buf, - const struct YuvConstants* yuvconstants, - int width); -void I422ToRGB24Row_C(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* rgb_buf, - const struct YuvConstants* yuvconstants, - int width); -void I422ToARGB4444Row_C(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* dst_argb4444, - const struct YuvConstants* yuvconstants, - int width); -void I422ToARGB1555Row_C(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* dst_argb1555, - const struct YuvConstants* yuvconstants, - int width); -void I422ToRGB565Row_C(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* dst_rgb565, - const struct YuvConstants* yuvconstants, - int width); -void I422ToARGBRow_AVX2(const uint8_t* y_buf, - const uint8_t* u_buf, - const uint8_t* v_buf, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width); -void I422ToRGBARow_AVX2(const uint8_t* y_buf, - const uint8_t* u_buf, - const uint8_t* v_buf, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width); -void I444ToARGBRow_SSSE3(const uint8_t* y_buf, - const uint8_t* u_buf, - const uint8_t* v_buf, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width); -void I444ToARGBRow_AVX2(const uint8_t* y_buf, - const uint8_t* u_buf, - const uint8_t* v_buf, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width); -void I422ToARGBRow_SSSE3(const uint8_t* y_buf, - const uint8_t* u_buf, - const uint8_t* v_buf, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width); - -void I422ToAR30Row_SSSE3(const uint8_t* y_buf, - const uint8_t* u_buf, - const uint8_t* v_buf, - uint8_t* dst_ar30, - const struct YuvConstants* yuvconstants, - int width); -void I210ToAR30Row_SSSE3(const uint16_t* y_buf, - const uint16_t* u_buf, - const uint16_t* v_buf, - uint8_t* dst_ar30, - const struct YuvConstants* yuvconstants, - int width); -void I210ToARGBRow_SSSE3(const uint16_t* y_buf, - const uint16_t* u_buf, - const uint16_t* v_buf, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width); -void I212ToAR30Row_SSSE3(const uint16_t* y_buf, - const uint16_t* u_buf, - const uint16_t* v_buf, - uint8_t* dst_ar30, - const struct YuvConstants* yuvconstants, - int width); -void I212ToARGBRow_SSSE3(const uint16_t* y_buf, - const uint16_t* u_buf, - const uint16_t* v_buf, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width); -void I410ToAR30Row_SSSE3(const uint16_t* y_buf, - const uint16_t* u_buf, - const uint16_t* v_buf, - uint8_t* dst_ar30, - const struct YuvConstants* yuvconstants, - int width); -void I410ToARGBRow_SSSE3(const uint16_t* y_buf, - const uint16_t* u_buf, - const uint16_t* v_buf, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width); -void I210AlphaToARGBRow_SSSE3(const uint16_t* y_buf, - const uint16_t* 
u_buf, - const uint16_t* v_buf, - const uint16_t* a_buf, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width); -void I410AlphaToARGBRow_SSSE3(const uint16_t* y_buf, - const uint16_t* u_buf, - const uint16_t* v_buf, - const uint16_t* a_buf, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width); -void I422ToAR30Row_AVX2(const uint8_t* y_buf, - const uint8_t* u_buf, - const uint8_t* v_buf, - uint8_t* dst_ar30, - const struct YuvConstants* yuvconstants, - int width); -void I210ToARGBRow_AVX2(const uint16_t* y_buf, - const uint16_t* u_buf, - const uint16_t* v_buf, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width); -void I210ToAR30Row_AVX2(const uint16_t* y_buf, - const uint16_t* u_buf, - const uint16_t* v_buf, - uint8_t* dst_ar30, - const struct YuvConstants* yuvconstants, - int width); -void I212ToARGBRow_AVX2(const uint16_t* y_buf, - const uint16_t* u_buf, - const uint16_t* v_buf, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width); -void I212ToAR30Row_AVX2(const uint16_t* y_buf, - const uint16_t* u_buf, - const uint16_t* v_buf, - uint8_t* dst_ar30, - const struct YuvConstants* yuvconstants, - int width); -void I410ToAR30Row_AVX2(const uint16_t* y_buf, - const uint16_t* u_buf, - const uint16_t* v_buf, - uint8_t* dst_ar30, - const struct YuvConstants* yuvconstants, - int width); -void I410ToARGBRow_AVX2(const uint16_t* y_buf, - const uint16_t* u_buf, - const uint16_t* v_buf, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width); -void I210AlphaToARGBRow_AVX2(const uint16_t* y_buf, - const uint16_t* u_buf, - const uint16_t* v_buf, - const uint16_t* a_buf, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width); -void I410AlphaToARGBRow_AVX2(const uint16_t* y_buf, - const uint16_t* u_buf, - const uint16_t* v_buf, - const uint16_t* a_buf, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width); -void I444AlphaToARGBRow_SSSE3(const uint8_t* y_buf, - const uint8_t* u_buf, - const uint8_t* v_buf, - const uint8_t* a_buf, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width); -void I444AlphaToARGBRow_AVX2(const uint8_t* y_buf, - const uint8_t* u_buf, - const uint8_t* v_buf, - const uint8_t* a_buf, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width); -void I422AlphaToARGBRow_SSSE3(const uint8_t* y_buf, - const uint8_t* u_buf, - const uint8_t* v_buf, - const uint8_t* a_buf, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width); -void I422AlphaToARGBRow_AVX2(const uint8_t* y_buf, - const uint8_t* u_buf, - const uint8_t* v_buf, - const uint8_t* a_buf, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width); -void NV12ToARGBRow_SSSE3(const uint8_t* y_buf, - const uint8_t* uv_buf, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width); -void NV12ToARGBRow_AVX2(const uint8_t* y_buf, - const uint8_t* uv_buf, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width); -void NV12ToRGB24Row_SSSE3(const uint8_t* src_y, - const uint8_t* src_uv, - uint8_t* dst_rgb24, - const struct YuvConstants* yuvconstants, - int width); -void NV21ToRGB24Row_SSSE3(const uint8_t* src_y, - const uint8_t* src_vu, - uint8_t* dst_rgb24, - const struct YuvConstants* yuvconstants, - int width); -void NV12ToRGB565Row_SSSE3(const uint8_t* src_y, - const uint8_t* src_uv, - uint8_t* dst_rgb565, - const struct YuvConstants* yuvconstants, - int width); 
-void NV12ToRGB24Row_AVX2(const uint8_t* src_y, - const uint8_t* src_uv, - uint8_t* dst_rgb24, - const struct YuvConstants* yuvconstants, - int width); -void NV21ToRGB24Row_AVX2(const uint8_t* src_y, - const uint8_t* src_vu, - uint8_t* dst_rgb24, - const struct YuvConstants* yuvconstants, - int width); -void NV21ToYUV24Row_AVX2(const uint8_t* src_y, - const uint8_t* src_vu, - uint8_t* dst_yuv24, - int width); -void NV12ToRGB565Row_AVX2(const uint8_t* src_y, - const uint8_t* src_uv, - uint8_t* dst_rgb565, - const struct YuvConstants* yuvconstants, - int width); -void NV21ToARGBRow_SSSE3(const uint8_t* y_buf, - const uint8_t* vu_buf, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width); -void NV21ToARGBRow_AVX2(const uint8_t* y_buf, - const uint8_t* vu_buf, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width); -void YUY2ToARGBRow_SSSE3(const uint8_t* yuy2_buf, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width); -void UYVYToARGBRow_SSSE3(const uint8_t* uyvy_buf, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width); -void YUY2ToARGBRow_AVX2(const uint8_t* yuy2_buf, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width); -void UYVYToARGBRow_AVX2(const uint8_t* uyvy_buf, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width); - -void P210ToARGBRow_SSSE3(const uint16_t* y_buf, - const uint16_t* uv_buf, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width); -void P410ToARGBRow_SSSE3(const uint16_t* y_buf, - const uint16_t* uv_buf, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width); -void P210ToAR30Row_SSSE3(const uint16_t* y_buf, - const uint16_t* uv_buf, - uint8_t* dst_ar30, - const struct YuvConstants* yuvconstants, - int width); -void P410ToAR30Row_SSSE3(const uint16_t* y_buf, - const uint16_t* uv_buf, - uint8_t* dst_ar30, - const struct YuvConstants* yuvconstants, - int width); -void P210ToARGBRow_AVX2(const uint16_t* y_buf, - const uint16_t* uv_buf, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width); -void P410ToARGBRow_AVX2(const uint16_t* y_buf, - const uint16_t* uv_buf, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width); -void P210ToAR30Row_AVX2(const uint16_t* y_buf, - const uint16_t* uv_buf, - uint8_t* dst_ar30, - const struct YuvConstants* yuvconstants, - int width); -void P410ToAR30Row_AVX2(const uint16_t* y_buf, - const uint16_t* uv_buf, - uint8_t* dst_ar30, - const struct YuvConstants* yuvconstants, - int width); - -void I422ToRGBARow_SSSE3(const uint8_t* y_buf, - const uint8_t* u_buf, - const uint8_t* v_buf, - uint8_t* dst_rgba, - const struct YuvConstants* yuvconstants, - int width); -void I422ToARGB4444Row_SSSE3(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* dst_argb4444, - const struct YuvConstants* yuvconstants, - int width); -void I422ToARGB4444Row_AVX2(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* dst_argb4444, - const struct YuvConstants* yuvconstants, - int width); -void I422ToARGB1555Row_SSSE3(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* dst_argb1555, - const struct YuvConstants* yuvconstants, - int width); -void I422ToARGB1555Row_AVX2(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* dst_argb1555, - const struct YuvConstants* yuvconstants, - int width); -void I422ToRGB565Row_SSSE3(const uint8_t* 
src_y, - const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* dst_rgb565, - const struct YuvConstants* yuvconstants, - int width); -void I422ToRGB565Row_AVX2(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* dst_rgb565, - const struct YuvConstants* yuvconstants, - int width); -void I422ToRGB24Row_SSSE3(const uint8_t* y_buf, - const uint8_t* u_buf, - const uint8_t* v_buf, - uint8_t* dst_rgb24, - const struct YuvConstants* yuvconstants, - int width); -void I422ToRGB24Row_AVX2(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* dst_rgb24, - const struct YuvConstants* yuvconstants, - int width); -void I422ToARGBRow_Any_AVX2(const uint8_t* y_buf, - const uint8_t* u_buf, - const uint8_t* v_buf, - uint8_t* dst_ptr, - const struct YuvConstants* yuvconstants, - int width); -void I422ToRGBARow_Any_AVX2(const uint8_t* y_buf, - const uint8_t* u_buf, - const uint8_t* v_buf, - uint8_t* dst_ptr, - const struct YuvConstants* yuvconstants, - int width); -void I444ToARGBRow_Any_SSSE3(const uint8_t* y_buf, - const uint8_t* u_buf, - const uint8_t* v_buf, - uint8_t* dst_ptr, - const struct YuvConstants* yuvconstants, - int width); -void I444ToARGBRow_Any_AVX2(const uint8_t* y_buf, - const uint8_t* u_buf, - const uint8_t* v_buf, - uint8_t* dst_ptr, - const struct YuvConstants* yuvconstants, - int width); -void I422ToARGBRow_Any_SSSE3(const uint8_t* y_buf, - const uint8_t* u_buf, - const uint8_t* v_buf, - uint8_t* dst_ptr, - const struct YuvConstants* yuvconstants, - int width); -void I422ToAR30Row_Any_SSSE3(const uint8_t* y_buf, - const uint8_t* u_buf, - const uint8_t* v_buf, - uint8_t* dst_ptr, - const struct YuvConstants* yuvconstants, - int width); -void I210ToAR30Row_Any_SSSE3(const uint16_t* y_buf, - const uint16_t* u_buf, - const uint16_t* v_buf, - uint8_t* dst_ptr, - const struct YuvConstants* yuvconstants, - int width); -void I210ToARGBRow_Any_SSSE3(const uint16_t* y_buf, - const uint16_t* u_buf, - const uint16_t* v_buf, - uint8_t* dst_ptr, - const struct YuvConstants* yuvconstants, - int width); -void I212ToAR30Row_Any_SSSE3(const uint16_t* y_buf, - const uint16_t* u_buf, - const uint16_t* v_buf, - uint8_t* dst_ptr, - const struct YuvConstants* yuvconstants, - int width); -void I212ToARGBRow_Any_SSSE3(const uint16_t* y_buf, - const uint16_t* u_buf, - const uint16_t* v_buf, - uint8_t* dst_ptr, - const struct YuvConstants* yuvconstants, - int width); -void I410ToAR30Row_Any_SSSE3(const uint16_t* y_buf, - const uint16_t* u_buf, - const uint16_t* v_buf, - uint8_t* dst_ptr, - const struct YuvConstants* yuvconstants, - int width); -void I410ToARGBRow_Any_SSSE3(const uint16_t* y_buf, - const uint16_t* u_buf, - const uint16_t* v_buf, - uint8_t* dst_ptr, - const struct YuvConstants* yuvconstants, - int width); -void I210AlphaToARGBRow_Any_SSSE3(const uint16_t* y_buf, - const uint16_t* u_buf, - const uint16_t* v_buf, - const uint16_t* a_buf, - uint8_t* dst_ptr, - const struct YuvConstants* yuvconstants, - int width); -void I410AlphaToARGBRow_Any_SSSE3(const uint16_t* y_buf, - const uint16_t* u_buf, - const uint16_t* v_buf, - const uint16_t* a_buf, - uint8_t* dst_ptr, - const struct YuvConstants* yuvconstants, - int width); -void I422ToAR30Row_Any_AVX2(const uint8_t* y_buf, - const uint8_t* u_buf, - const uint8_t* v_buf, - uint8_t* dst_ptr, - const struct YuvConstants* yuvconstants, - int width); -void I210ToARGBRow_Any_AVX2(const uint16_t* y_buf, - const uint16_t* u_buf, - const uint16_t* v_buf, - uint8_t* dst_ptr, - const struct YuvConstants* 
yuvconstants, - int width); -void I210ToAR30Row_Any_AVX2(const uint16_t* y_buf, - const uint16_t* u_buf, - const uint16_t* v_buf, - uint8_t* dst_ptr, - const struct YuvConstants* yuvconstants, - int width); -void I212ToARGBRow_Any_AVX2(const uint16_t* y_buf, - const uint16_t* u_buf, - const uint16_t* v_buf, - uint8_t* dst_ptr, - const struct YuvConstants* yuvconstants, - int width); -void I212ToAR30Row_Any_AVX2(const uint16_t* y_buf, - const uint16_t* u_buf, - const uint16_t* v_buf, - uint8_t* dst_ptr, - const struct YuvConstants* yuvconstants, - int width); -void I410ToAR30Row_Any_AVX2(const uint16_t* y_buf, - const uint16_t* u_buf, - const uint16_t* v_buf, - uint8_t* dst_ptr, - const struct YuvConstants* yuvconstants, - int width); -void I410ToARGBRow_Any_AVX2(const uint16_t* y_buf, - const uint16_t* u_buf, - const uint16_t* v_buf, - uint8_t* dst_ptr, - const struct YuvConstants* yuvconstants, - int width); -void I210AlphaToARGBRow_Any_AVX2(const uint16_t* y_buf, - const uint16_t* u_buf, - const uint16_t* v_buf, - const uint16_t* a_buf, - uint8_t* dst_ptr, - const struct YuvConstants* yuvconstants, - int width); -void I410AlphaToARGBRow_Any_AVX2(const uint16_t* y_buf, - const uint16_t* u_buf, - const uint16_t* v_buf, - const uint16_t* a_buf, - uint8_t* dst_ptr, - const struct YuvConstants* yuvconstants, - int width); -void I444AlphaToARGBRow_Any_SSSE3(const uint8_t* y_buf, - const uint8_t* u_buf, - const uint8_t* v_buf, - const uint8_t* a_buf, - uint8_t* dst_ptr, - const struct YuvConstants* yuvconstants, - int width); -void I444AlphaToARGBRow_Any_AVX2(const uint8_t* y_buf, - const uint8_t* u_buf, - const uint8_t* v_buf, - const uint8_t* a_buf, - uint8_t* dst_ptr, - const struct YuvConstants* yuvconstants, - int width); -void I422AlphaToARGBRow_Any_SSSE3(const uint8_t* y_buf, - const uint8_t* u_buf, - const uint8_t* v_buf, - const uint8_t* a_buf, - uint8_t* dst_ptr, - const struct YuvConstants* yuvconstants, - int width); -void I422AlphaToARGBRow_Any_AVX2(const uint8_t* y_buf, - const uint8_t* u_buf, - const uint8_t* v_buf, - const uint8_t* a_buf, - uint8_t* dst_ptr, - const struct YuvConstants* yuvconstants, - int width); -void NV12ToARGBRow_Any_SSSE3(const uint8_t* y_buf, - const uint8_t* uv_buf, - uint8_t* dst_ptr, - const struct YuvConstants* yuvconstants, - int width); -void NV12ToARGBRow_Any_AVX2(const uint8_t* y_buf, - const uint8_t* uv_buf, - uint8_t* dst_ptr, - const struct YuvConstants* yuvconstants, - int width); -void NV21ToARGBRow_Any_SSSE3(const uint8_t* y_buf, - const uint8_t* uv_buf, - uint8_t* dst_ptr, - const struct YuvConstants* yuvconstants, - int width); -void NV21ToARGBRow_Any_AVX2(const uint8_t* y_buf, - const uint8_t* uv_buf, - uint8_t* dst_ptr, - const struct YuvConstants* yuvconstants, - int width); -void NV12ToRGB24Row_Any_SSSE3(const uint8_t* y_buf, - const uint8_t* uv_buf, - uint8_t* dst_ptr, - const struct YuvConstants* yuvconstants, - int width); -void NV21ToRGB24Row_Any_SSSE3(const uint8_t* y_buf, - const uint8_t* uv_buf, - uint8_t* dst_ptr, - const struct YuvConstants* yuvconstants, - int width); -void NV12ToRGB24Row_Any_AVX2(const uint8_t* y_buf, - const uint8_t* uv_buf, - uint8_t* dst_ptr, - const struct YuvConstants* yuvconstants, - int width); -void NV21ToRGB24Row_Any_AVX2(const uint8_t* y_buf, - const uint8_t* uv_buf, - uint8_t* dst_ptr, - const struct YuvConstants* yuvconstants, - int width); -void NV21ToYUV24Row_Any_AVX2(const uint8_t* src_y, - const uint8_t* src_vu, - uint8_t* dst_yuv24, - int width); -void NV12ToRGB565Row_Any_SSSE3(const uint8_t* 
y_buf, - const uint8_t* uv_buf, - uint8_t* dst_ptr, - const struct YuvConstants* yuvconstants, - int width); -void NV12ToRGB565Row_Any_AVX2(const uint8_t* y_buf, - const uint8_t* uv_buf, - uint8_t* dst_ptr, - const struct YuvConstants* yuvconstants, - int width); -void YUY2ToARGBRow_Any_SSSE3(const uint8_t* src_ptr, - uint8_t* dst_ptr, - const struct YuvConstants* yuvconstants, - int width); -void UYVYToARGBRow_Any_SSSE3(const uint8_t* src_ptr, - uint8_t* dst_ptr, - const struct YuvConstants* yuvconstants, - int width); -void YUY2ToARGBRow_Any_AVX2(const uint8_t* src_ptr, - uint8_t* dst_ptr, - const struct YuvConstants* yuvconstants, - int width); -void UYVYToARGBRow_Any_AVX2(const uint8_t* src_ptr, - uint8_t* dst_ptr, - const struct YuvConstants* yuvconstants, - int width); -void P210ToARGBRow_Any_SSSE3(const uint16_t* y_buf, - const uint16_t* uv_buf, - uint8_t* dst_ptr, - const struct YuvConstants* yuvconstants, - int width); -void P410ToARGBRow_Any_SSSE3(const uint16_t* y_buf, - const uint16_t* uv_buf, - uint8_t* dst_ptr, - const struct YuvConstants* yuvconstants, - int width); -void P210ToAR30Row_Any_SSSE3(const uint16_t* y_buf, - const uint16_t* uv_buf, - uint8_t* dst_ptr, - const struct YuvConstants* yuvconstants, - int width); -void P410ToAR30Row_Any_SSSE3(const uint16_t* y_buf, - const uint16_t* uv_buf, - uint8_t* dst_ptr, - const struct YuvConstants* yuvconstants, - int width); -void P210ToARGBRow_Any_AVX2(const uint16_t* y_buf, - const uint16_t* uv_buf, - uint8_t* dst_ptr, - const struct YuvConstants* yuvconstants, - int width); -void P410ToARGBRow_Any_AVX2(const uint16_t* y_buf, - const uint16_t* uv_buf, - uint8_t* dst_ptr, - const struct YuvConstants* yuvconstants, - int width); -void P210ToAR30Row_Any_AVX2(const uint16_t* y_buf, - const uint16_t* uv_buf, - uint8_t* dst_ptr, - const struct YuvConstants* yuvconstants, - int width); -void P410ToAR30Row_Any_AVX2(const uint16_t* y_buf, - const uint16_t* uv_buf, - uint8_t* dst_ptr, - const struct YuvConstants* yuvconstants, - int width); -void I422ToRGBARow_Any_SSSE3(const uint8_t* y_buf, - const uint8_t* u_buf, - const uint8_t* v_buf, - uint8_t* dst_ptr, - const struct YuvConstants* yuvconstants, - int width); -void I422ToARGB4444Row_Any_SSSE3(const uint8_t* y_buf, - const uint8_t* u_buf, - const uint8_t* v_buf, - uint8_t* dst_ptr, - const struct YuvConstants* yuvconstants, - int width); -void I422ToARGB4444Row_Any_AVX2(const uint8_t* y_buf, - const uint8_t* u_buf, - const uint8_t* v_buf, - uint8_t* dst_ptr, - const struct YuvConstants* yuvconstants, - int width); -void I422ToARGB1555Row_Any_SSSE3(const uint8_t* y_buf, - const uint8_t* u_buf, - const uint8_t* v_buf, - uint8_t* dst_ptr, - const struct YuvConstants* yuvconstants, - int width); -void I422ToARGB1555Row_Any_AVX2(const uint8_t* y_buf, - const uint8_t* u_buf, - const uint8_t* v_buf, - uint8_t* dst_ptr, - const struct YuvConstants* yuvconstants, - int width); -void I422ToRGB565Row_Any_SSSE3(const uint8_t* y_buf, - const uint8_t* u_buf, - const uint8_t* v_buf, - uint8_t* dst_ptr, - const struct YuvConstants* yuvconstants, - int width); -void I422ToRGB565Row_Any_AVX2(const uint8_t* y_buf, - const uint8_t* u_buf, - const uint8_t* v_buf, - uint8_t* dst_ptr, - const struct YuvConstants* yuvconstants, - int width); -void I422ToRGB24Row_Any_SSSE3(const uint8_t* y_buf, - const uint8_t* u_buf, - const uint8_t* v_buf, - uint8_t* dst_ptr, - const struct YuvConstants* yuvconstants, - int width); -void I422ToRGB24Row_Any_AVX2(const uint8_t* y_buf, - const uint8_t* u_buf, - const 
uint8_t* v_buf, - uint8_t* dst_ptr, - const struct YuvConstants* yuvconstants, - int width); - -void I400ToARGBRow_C(const uint8_t* src_y, - uint8_t* rgb_buf, - const struct YuvConstants* yuvconstants, - int width); -void I400ToARGBRow_SSE2(const uint8_t* y_buf, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width); -void I400ToARGBRow_AVX2(const uint8_t* y_buf, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width); -void I400ToARGBRow_NEON(const uint8_t* src_y, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width); -void I400ToARGBRow_MSA(const uint8_t* src_y, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width); -void I400ToARGBRow_MMI(const uint8_t* src_y, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width); -void I400ToARGBRow_Any_SSE2(const uint8_t* src_ptr, - uint8_t* dst_ptr, - const struct YuvConstants* param, - int width); -void I400ToARGBRow_Any_AVX2(const uint8_t* src_ptr, - uint8_t* dst_ptr, - const struct YuvConstants* param, - int width); -void I400ToARGBRow_Any_NEON(const uint8_t* src_ptr, - uint8_t* dst_ptr, - const struct YuvConstants* param, - int width); -void I400ToARGBRow_Any_MSA(const uint8_t* src_ptr, - uint8_t* dst_ptr, - const struct YuvConstants* yuvconstants, - int width); -void I400ToARGBRow_Any_MMI(const uint8_t* src_ptr, - uint8_t* dst_ptr, - const struct YuvConstants* yuvconstants, - int width); - -// ARGB preattenuated alpha blend. -void ARGBBlendRow_SSSE3(const uint8_t* src_argb, - const uint8_t* src_argb1, - uint8_t* dst_argb, - int width); -void ARGBBlendRow_NEON(const uint8_t* src_argb, - const uint8_t* src_argb1, - uint8_t* dst_argb, - int width); -void ARGBBlendRow_MSA(const uint8_t* src_argb0, - const uint8_t* src_argb1, - uint8_t* dst_argb, - int width); -void ARGBBlendRow_MMI(const uint8_t* src_argb0, - const uint8_t* src_argb1, - uint8_t* dst_argb, - int width); -void ARGBBlendRow_C(const uint8_t* src_argb, - const uint8_t* src_argb1, - uint8_t* dst_argb, - int width); - -// Unattenuated planar alpha blend. -void BlendPlaneRow_SSSE3(const uint8_t* src0, - const uint8_t* src1, - const uint8_t* alpha, - uint8_t* dst, - int width); -void BlendPlaneRow_Any_SSSE3(const uint8_t* y_buf, - const uint8_t* u_buf, - const uint8_t* v_buf, - uint8_t* dst_ptr, - int width); -void BlendPlaneRow_AVX2(const uint8_t* src0, - const uint8_t* src1, - const uint8_t* alpha, - uint8_t* dst, - int width); -void BlendPlaneRow_Any_AVX2(const uint8_t* y_buf, - const uint8_t* u_buf, - const uint8_t* v_buf, - uint8_t* dst_ptr, - int width); -void BlendPlaneRow_MMI(const uint8_t* src0, - const uint8_t* src1, - const uint8_t* alpha, - uint8_t* dst, - int width); -void BlendPlaneRow_Any_MMI(const uint8_t* y_buf, - const uint8_t* u_buf, - const uint8_t* v_buf, - uint8_t* dst_ptr, - int width); -void BlendPlaneRow_C(const uint8_t* src0, - const uint8_t* src1, - const uint8_t* alpha, - uint8_t* dst, - int width); - -// ARGB multiply images. Same API as Blend, but these require -// pointer and width alignment for SSE2. 
-void ARGBMultiplyRow_C(const uint8_t* src_argb, - const uint8_t* src_argb1, - uint8_t* dst_argb, - int width); -void ARGBMultiplyRow_SSE2(const uint8_t* src_argb, - const uint8_t* src_argb1, - uint8_t* dst_argb, - int width); -void ARGBMultiplyRow_Any_SSE2(const uint8_t* y_buf, - const uint8_t* uv_buf, - uint8_t* dst_ptr, - int width); -void ARGBMultiplyRow_AVX2(const uint8_t* src_argb, - const uint8_t* src_argb1, - uint8_t* dst_argb, - int width); -void ARGBMultiplyRow_Any_AVX2(const uint8_t* y_buf, - const uint8_t* uv_buf, - uint8_t* dst_ptr, - int width); -void ARGBMultiplyRow_NEON(const uint8_t* src_argb, - const uint8_t* src_argb1, - uint8_t* dst_argb, - int width); -void ARGBMultiplyRow_Any_NEON(const uint8_t* y_buf, - const uint8_t* uv_buf, - uint8_t* dst_ptr, - int width); -void ARGBMultiplyRow_MSA(const uint8_t* src_argb0, - const uint8_t* src_argb1, - uint8_t* dst_argb, - int width); -void ARGBMultiplyRow_Any_MSA(const uint8_t* y_buf, - const uint8_t* uv_buf, - uint8_t* dst_ptr, - int width); -void ARGBMultiplyRow_MMI(const uint8_t* src_argb0, - const uint8_t* src_argb1, - uint8_t* dst_argb, - int width); -void ARGBMultiplyRow_Any_MMI(const uint8_t* y_buf, - const uint8_t* uv_buf, - uint8_t* dst_ptr, - int width); - -// ARGB add images. -void ARGBAddRow_C(const uint8_t* src_argb, - const uint8_t* src_argb1, - uint8_t* dst_argb, - int width); -void ARGBAddRow_SSE2(const uint8_t* src_argb, - const uint8_t* src_argb1, - uint8_t* dst_argb, - int width); -void ARGBAddRow_Any_SSE2(const uint8_t* y_buf, - const uint8_t* uv_buf, - uint8_t* dst_ptr, - int width); -void ARGBAddRow_AVX2(const uint8_t* src_argb, - const uint8_t* src_argb1, - uint8_t* dst_argb, - int width); -void ARGBAddRow_Any_AVX2(const uint8_t* y_buf, - const uint8_t* uv_buf, - uint8_t* dst_ptr, - int width); -void ARGBAddRow_NEON(const uint8_t* src_argb, - const uint8_t* src_argb1, - uint8_t* dst_argb, - int width); -void ARGBAddRow_Any_NEON(const uint8_t* y_buf, - const uint8_t* uv_buf, - uint8_t* dst_ptr, - int width); -void ARGBAddRow_MSA(const uint8_t* src_argb0, - const uint8_t* src_argb1, - uint8_t* dst_argb, - int width); -void ARGBAddRow_Any_MSA(const uint8_t* y_buf, - const uint8_t* uv_buf, - uint8_t* dst_ptr, - int width); -void ARGBAddRow_MMI(const uint8_t* src_argb0, - const uint8_t* src_argb1, - uint8_t* dst_argb, - int width); -void ARGBAddRow_Any_MMI(const uint8_t* y_buf, - const uint8_t* uv_buf, - uint8_t* dst_ptr, - int width); - -// ARGB subtract images. Same API as Blend, but these require -// pointer and width alignment for SSE2. 
-void ARGBSubtractRow_C(const uint8_t* src_argb, - const uint8_t* src_argb1, - uint8_t* dst_argb, - int width); -void ARGBSubtractRow_SSE2(const uint8_t* src_argb, - const uint8_t* src_argb1, - uint8_t* dst_argb, - int width); -void ARGBSubtractRow_Any_SSE2(const uint8_t* y_buf, - const uint8_t* uv_buf, - uint8_t* dst_ptr, - int width); -void ARGBSubtractRow_AVX2(const uint8_t* src_argb, - const uint8_t* src_argb1, - uint8_t* dst_argb, - int width); -void ARGBSubtractRow_Any_AVX2(const uint8_t* y_buf, - const uint8_t* uv_buf, - uint8_t* dst_ptr, - int width); -void ARGBSubtractRow_NEON(const uint8_t* src_argb, - const uint8_t* src_argb1, - uint8_t* dst_argb, - int width); -void ARGBSubtractRow_Any_NEON(const uint8_t* y_buf, - const uint8_t* uv_buf, - uint8_t* dst_ptr, - int width); -void ARGBSubtractRow_MSA(const uint8_t* src_argb0, - const uint8_t* src_argb1, - uint8_t* dst_argb, - int width); -void ARGBSubtractRow_Any_MSA(const uint8_t* y_buf, - const uint8_t* uv_buf, - uint8_t* dst_ptr, - int width); -void ARGBSubtractRow_MMI(const uint8_t* src_argb0, - const uint8_t* src_argb1, - uint8_t* dst_argb, - int width); -void ARGBSubtractRow_Any_MMI(const uint8_t* y_buf, - const uint8_t* uv_buf, - uint8_t* dst_ptr, - int width); - -void ARGBToRGB24Row_Any_SSSE3(const uint8_t* src_ptr, - uint8_t* dst_ptr, - int width); -void ARGBToRAWRow_Any_SSSE3(const uint8_t* src_ptr, - uint8_t* dst_ptr, - int width); -void ARGBToRGB565Row_Any_SSE2(const uint8_t* src_ptr, - uint8_t* dst_ptr, - int width); -void ARGBToARGB1555Row_Any_SSE2(const uint8_t* src_ptr, - uint8_t* dst_ptr, - int width); -void ARGBToARGB4444Row_Any_SSE2(const uint8_t* src_ptr, - uint8_t* dst_ptr, - int width); -void ABGRToAR30Row_Any_SSSE3(const uint8_t* src_ptr, - uint8_t* dst_ptr, - int width); -void ARGBToAR30Row_Any_SSSE3(const uint8_t* src_ptr, - uint8_t* dst_ptr, - int width); -void ARGBToRAWRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); -void ARGBToRGB24Row_Any_AVX2(const uint8_t* src_ptr, - uint8_t* dst_ptr, - int width); -void ARGBToRGB24Row_Any_AVX512VBMI(const uint8_t* src_ptr, - uint8_t* dst_ptr, - int width); -void ARGBToRGB565DitherRow_Any_SSE2(const uint8_t* src_ptr, - uint8_t* dst_ptr, - const uint32_t param, - int width); -void ARGBToRGB565DitherRow_Any_AVX2(const uint8_t* src_ptr, - uint8_t* dst_ptr, - const uint32_t param, - int width); - -void ARGBToRGB565Row_Any_AVX2(const uint8_t* src_ptr, - uint8_t* dst_ptr, - int width); -void ARGBToARGB1555Row_Any_AVX2(const uint8_t* src_ptr, - uint8_t* dst_ptr, - int width); -void ARGBToARGB4444Row_Any_AVX2(const uint8_t* src_ptr, - uint8_t* dst_ptr, - int width); -void ABGRToAR30Row_Any_AVX2(const uint8_t* src_ptr, - uint8_t* dst_ptr, - int width); -void ARGBToAR30Row_Any_AVX2(const uint8_t* src_ptr, - uint8_t* dst_ptr, - int width); - -void ARGBToRGB24Row_Any_NEON(const uint8_t* src_ptr, - uint8_t* dst_ptr, - int width); -void ARGBToRAWRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); -void ARGBToRGB565Row_Any_NEON(const uint8_t* src_ptr, - uint8_t* dst_ptr, - int width); -void ARGBToARGB1555Row_Any_NEON(const uint8_t* src_ptr, - uint8_t* dst_ptr, - int width); -void ARGBToARGB4444Row_Any_NEON(const uint8_t* src_ptr, - uint8_t* dst_ptr, - int width); -void ARGBToRGB565DitherRow_Any_NEON(const uint8_t* src_ptr, - uint8_t* dst_ptr, - const uint32_t param, - int width); -void ARGBToRGB24Row_Any_MSA(const uint8_t* src_ptr, - uint8_t* dst_ptr, - int width); -void ARGBToRAWRow_Any_MSA(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); -void 
ARGBToRGB565Row_Any_MSA(const uint8_t* src_ptr, - uint8_t* dst_ptr, - int width); -void ARGBToARGB1555Row_Any_MSA(const uint8_t* src_ptr, - uint8_t* dst_ptr, - int width); -void ARGBToARGB4444Row_Any_MSA(const uint8_t* src_ptr, - uint8_t* dst_ptr, - int width); -void ARGBToRGB565DitherRow_Any_MSA(const uint8_t* src_ptr, - uint8_t* dst_ptr, - const uint32_t param, - int width); - -void ARGBToRGB24Row_Any_MMI(const uint8_t* src_ptr, - uint8_t* dst_ptr, - int width); -void ARGBToRAWRow_Any_MMI(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); -void ARGBToRGB565Row_Any_MMI(const uint8_t* src_ptr, - uint8_t* dst_ptr, - int width); -void ARGBToARGB1555Row_Any_MMI(const uint8_t* src_ptr, - uint8_t* dst_ptr, - int width); -void ARGBToARGB4444Row_Any_MMI(const uint8_t* src_ptr, - uint8_t* dst_ptr, - int width); -void ARGBToRGB565DitherRow_Any_MMI(const uint8_t* src_ptr, - uint8_t* dst_ptr, - const uint32_t param, - int width); - -void I444ToARGBRow_Any_NEON(const uint8_t* y_buf, - const uint8_t* u_buf, - const uint8_t* v_buf, - uint8_t* dst_ptr, - const struct YuvConstants* yuvconstants, - int width); -void I422ToARGBRow_Any_NEON(const uint8_t* y_buf, - const uint8_t* u_buf, - const uint8_t* v_buf, - uint8_t* dst_ptr, - const struct YuvConstants* yuvconstants, - int width); -void I444AlphaToARGBRow_Any_NEON(const uint8_t* y_buf, - const uint8_t* u_buf, - const uint8_t* v_buf, - const uint8_t* a_buf, - uint8_t* dst_ptr, - const struct YuvConstants* yuvconstants, - int width); -void I422AlphaToARGBRow_Any_NEON(const uint8_t* y_buf, - const uint8_t* u_buf, - const uint8_t* v_buf, - const uint8_t* a_buf, - uint8_t* dst_ptr, - const struct YuvConstants* yuvconstants, - int width); -void I422ToRGBARow_Any_NEON(const uint8_t* y_buf, - const uint8_t* u_buf, - const uint8_t* v_buf, - uint8_t* dst_ptr, - const struct YuvConstants* yuvconstants, - int width); -void I422ToRGB24Row_Any_NEON(const uint8_t* y_buf, - const uint8_t* u_buf, - const uint8_t* v_buf, - uint8_t* dst_ptr, - const struct YuvConstants* yuvconstants, - int width); -void I422ToARGB4444Row_Any_NEON(const uint8_t* y_buf, - const uint8_t* u_buf, - const uint8_t* v_buf, - uint8_t* dst_ptr, - const struct YuvConstants* yuvconstants, - int width); -void I422ToARGB1555Row_Any_NEON(const uint8_t* y_buf, - const uint8_t* u_buf, - const uint8_t* v_buf, - uint8_t* dst_ptr, - const struct YuvConstants* yuvconstants, - int width); -void I422ToRGB565Row_Any_NEON(const uint8_t* y_buf, - const uint8_t* u_buf, - const uint8_t* v_buf, - uint8_t* dst_ptr, - const struct YuvConstants* yuvconstants, - int width); -void NV12ToARGBRow_Any_NEON(const uint8_t* y_buf, - const uint8_t* uv_buf, - uint8_t* dst_ptr, - const struct YuvConstants* yuvconstants, - int width); -void NV21ToARGBRow_Any_NEON(const uint8_t* y_buf, - const uint8_t* uv_buf, - uint8_t* dst_ptr, - const struct YuvConstants* yuvconstants, - int width); -void NV12ToRGB24Row_Any_NEON(const uint8_t* y_buf, - const uint8_t* uv_buf, - uint8_t* dst_ptr, - const struct YuvConstants* yuvconstants, - int width); -void NV21ToRGB24Row_Any_NEON(const uint8_t* y_buf, - const uint8_t* uv_buf, - uint8_t* dst_ptr, - const struct YuvConstants* yuvconstants, - int width); -void NV21ToYUV24Row_Any_NEON(const uint8_t* y_buf, - const uint8_t* uv_buf, - uint8_t* dst_ptr, - int width); -void NV12ToRGB565Row_Any_NEON(const uint8_t* y_buf, - const uint8_t* uv_buf, - uint8_t* dst_ptr, - const struct YuvConstants* yuvconstants, - int width); -void YUY2ToARGBRow_Any_NEON(const uint8_t* src_ptr, - uint8_t* dst_ptr, - const 
struct YuvConstants* yuvconstants, - int width); -void UYVYToARGBRow_Any_NEON(const uint8_t* src_ptr, - uint8_t* dst_ptr, - const struct YuvConstants* yuvconstants, - int width); -void P210ToARGBRow_NEON(const uint16_t* y_buf, - const uint16_t* uv_buf, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width); -void P410ToARGBRow_NEON(const uint16_t* y_buf, - const uint16_t* uv_buf, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width); -void P210ToAR30Row_NEON(const uint16_t* y_buf, - const uint16_t* uv_buf, - uint8_t* dst_ar30, - const struct YuvConstants* yuvconstants, - int width); -void P410ToAR30Row_NEON(const uint16_t* y_buf, - const uint16_t* uv_buf, - uint8_t* dst_ar30, - const struct YuvConstants* yuvconstants, - int width); -void P210ToARGBRow_Any_NEON(const uint16_t* y_buf, - const uint16_t* uv_buf, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width); -void P410ToARGBRow_Any_NEON(const uint16_t* y_buf, - const uint16_t* uv_buf, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width); -void P210ToAR30Row_Any_NEON(const uint16_t* y_buf, - const uint16_t* uv_buf, - uint8_t* dst_ar30, - const struct YuvConstants* yuvconstants, - int width); -void P410ToAR30Row_Any_NEON(const uint16_t* y_buf, - const uint16_t* uv_buf, - uint8_t* dst_ar30, - const struct YuvConstants* yuvconstants, - int width); -void I444ToARGBRow_Any_MSA(const uint8_t* y_buf, - const uint8_t* u_buf, - const uint8_t* v_buf, - uint8_t* dst_ptr, - const struct YuvConstants* yuvconstants, - int width); -void I444ToARGBRow_Any_MMI(const uint8_t* y_buf, - const uint8_t* u_buf, - const uint8_t* v_buf, - uint8_t* dst_ptr, - const struct YuvConstants* yuvconstants, - int width); -void I422ToARGBRow_Any_MSA(const uint8_t* y_buf, - const uint8_t* u_buf, - const uint8_t* v_buf, - uint8_t* dst_ptr, - const struct YuvConstants* yuvconstants, - int width); -void I422ToARGBRow_Any_MMI(const uint8_t* y_buf, - const uint8_t* u_buf, - const uint8_t* v_buf, - uint8_t* dst_ptr, - const struct YuvConstants* yuvconstants, - int width); -void I422ToRGBARow_Any_MSA(const uint8_t* y_buf, - const uint8_t* u_buf, - const uint8_t* v_buf, - uint8_t* dst_ptr, - const struct YuvConstants* yuvconstants, - int width); -void I422AlphaToARGBRow_Any_MSA(const uint8_t* y_buf, - const uint8_t* u_buf, - const uint8_t* v_buf, - const uint8_t* a_buf, - uint8_t* dst_ptr, - const struct YuvConstants* yuvconstants, - int width); -void I422ToRGB24Row_Any_MSA(const uint8_t* y_buf, - const uint8_t* u_buf, - const uint8_t* v_buf, - uint8_t* dst_ptr, - const struct YuvConstants* yuvconstants, - int width); -void I422ToRGB565Row_Any_MSA(const uint8_t* y_buf, - const uint8_t* u_buf, - const uint8_t* v_buf, - uint8_t* dst_ptr, - const struct YuvConstants* yuvconstants, - int width); -void I422ToARGB4444Row_Any_MSA(const uint8_t* y_buf, - const uint8_t* u_buf, - const uint8_t* v_buf, - uint8_t* dst_ptr, - const struct YuvConstants* yuvconstants, - int width); -void I422ToARGB1555Row_Any_MSA(const uint8_t* y_buf, - const uint8_t* u_buf, - const uint8_t* v_buf, - uint8_t* dst_ptr, - const struct YuvConstants* yuvconstants, - int width); -void NV12ToARGBRow_Any_MSA(const uint8_t* y_buf, - const uint8_t* uv_buf, - uint8_t* dst_ptr, - const struct YuvConstants* yuvconstants, - int width); -void NV12ToRGB565Row_Any_MSA(const uint8_t* y_buf, - const uint8_t* uv_buf, - uint8_t* dst_ptr, - const struct YuvConstants* yuvconstants, - int width); -void NV21ToARGBRow_Any_MSA(const uint8_t* 
y_buf, - const uint8_t* uv_buf, - uint8_t* dst_ptr, - const struct YuvConstants* yuvconstants, - int width); -void YUY2ToARGBRow_Any_MSA(const uint8_t* src_ptr, - uint8_t* dst_ptr, - const struct YuvConstants* yuvconstants, - int width); -void UYVYToARGBRow_Any_MSA(const uint8_t* src_ptr, - uint8_t* dst_ptr, - const struct YuvConstants* yuvconstants, - int width); - -void YUY2ToYRow_AVX2(const uint8_t* src_yuy2, uint8_t* dst_y, int width); -void YUY2ToUVRow_AVX2(const uint8_t* src_yuy2, - int stride_yuy2, - uint8_t* dst_u, - uint8_t* dst_v, - int width); -void YUY2ToUV422Row_AVX2(const uint8_t* src_yuy2, - uint8_t* dst_u, - uint8_t* dst_v, - int width); -void YUY2ToYRow_SSE2(const uint8_t* src_yuy2, uint8_t* dst_y, int width); -void YUY2ToUVRow_SSE2(const uint8_t* src_yuy2, - int stride_yuy2, - uint8_t* dst_u, - uint8_t* dst_v, - int width); -void YUY2ToUV422Row_SSE2(const uint8_t* src_yuy2, - uint8_t* dst_u, - uint8_t* dst_v, - int width); -void YUY2ToYRow_NEON(const uint8_t* src_yuy2, uint8_t* dst_y, int width); -void YUY2ToUVRow_NEON(const uint8_t* src_yuy2, - int stride_yuy2, - uint8_t* dst_u, - uint8_t* dst_v, - int width); -void YUY2ToUV422Row_NEON(const uint8_t* src_yuy2, - uint8_t* dst_u, - uint8_t* dst_v, - int width); -void YUY2ToYRow_MSA(const uint8_t* src_yuy2, uint8_t* dst_y, int width); -void YUY2ToYRow_MMI(const uint8_t* src_yuy2, uint8_t* dst_y, int width); -void YUY2ToUVRow_MSA(const uint8_t* src_yuy2, - int src_stride_yuy2, - uint8_t* dst_u, - uint8_t* dst_v, - int width); -void YUY2ToUVRow_MMI(const uint8_t* src_yuy2, - int src_stride_yuy2, - uint8_t* dst_u, - uint8_t* dst_v, - int width); -void YUY2ToUV422Row_MSA(const uint8_t* src_yuy2, - uint8_t* dst_u, - uint8_t* dst_v, - int width); -void YUY2ToUV422Row_MMI(const uint8_t* src_yuy2, - uint8_t* dst_u, - uint8_t* dst_v, - int width); -void YUY2ToYRow_C(const uint8_t* src_yuy2, uint8_t* dst_y, int width); -void YUY2ToUVRow_C(const uint8_t* src_yuy2, - int src_stride_yuy2, - uint8_t* dst_u, - uint8_t* dst_v, - int width); -void YUY2ToUV422Row_C(const uint8_t* src_yuy2, - uint8_t* dst_u, - uint8_t* dst_v, - int width); -void YUY2ToYRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); -void YUY2ToUVRow_Any_AVX2(const uint8_t* src_ptr, - int src_stride, - uint8_t* dst_u, - uint8_t* dst_v, - int width); -void YUY2ToUV422Row_Any_AVX2(const uint8_t* src_ptr, - uint8_t* dst_u, - uint8_t* dst_v, - int width); -void YUY2ToYRow_Any_SSE2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); -void YUY2ToUVRow_Any_SSE2(const uint8_t* src_ptr, - int src_stride, - uint8_t* dst_u, - uint8_t* dst_v, - int width); -void YUY2ToUV422Row_Any_SSE2(const uint8_t* src_ptr, - uint8_t* dst_u, - uint8_t* dst_v, - int width); -void YUY2ToYRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); -void YUY2ToUVRow_Any_NEON(const uint8_t* src_ptr, - int src_stride, - uint8_t* dst_u, - uint8_t* dst_v, - int width); -void YUY2ToUV422Row_Any_NEON(const uint8_t* src_ptr, - uint8_t* dst_u, - uint8_t* dst_v, - int width); -void YUY2ToYRow_Any_MSA(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); -void YUY2ToYRow_Any_MMI(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); -void YUY2ToUVRow_Any_MSA(const uint8_t* src_ptr, - int src_stride_ptr, - uint8_t* dst_u, - uint8_t* dst_v, - int width); -void YUY2ToUVRow_Any_MMI(const uint8_t* src_ptr, - int src_stride_ptr, - uint8_t* dst_u, - uint8_t* dst_v, - int width); -void YUY2ToUV422Row_Any_MSA(const uint8_t* src_ptr, - uint8_t* dst_u, - uint8_t* dst_v, - int width); -void 
YUY2ToUV422Row_Any_MMI(const uint8_t* src_ptr, - uint8_t* dst_u, - uint8_t* dst_v, - int width); -void UYVYToYRow_AVX2(const uint8_t* src_uyvy, uint8_t* dst_y, int width); -void UYVYToUVRow_AVX2(const uint8_t* src_uyvy, - int stride_uyvy, - uint8_t* dst_u, - uint8_t* dst_v, - int width); -void UYVYToUV422Row_AVX2(const uint8_t* src_uyvy, - uint8_t* dst_u, - uint8_t* dst_v, - int width); -void UYVYToYRow_SSE2(const uint8_t* src_uyvy, uint8_t* dst_y, int width); -void UYVYToUVRow_SSE2(const uint8_t* src_uyvy, - int stride_uyvy, - uint8_t* dst_u, - uint8_t* dst_v, - int width); -void UYVYToUV422Row_SSE2(const uint8_t* src_uyvy, - uint8_t* dst_u, - uint8_t* dst_v, - int width); -void UYVYToYRow_AVX2(const uint8_t* src_uyvy, uint8_t* dst_y, int width); -void UYVYToUVRow_AVX2(const uint8_t* src_uyvy, - int stride_uyvy, - uint8_t* dst_u, - uint8_t* dst_v, - int width); -void UYVYToUV422Row_AVX2(const uint8_t* src_uyvy, - uint8_t* dst_u, - uint8_t* dst_v, - int width); -void UYVYToYRow_NEON(const uint8_t* src_uyvy, uint8_t* dst_y, int width); -void UYVYToUVRow_NEON(const uint8_t* src_uyvy, - int stride_uyvy, - uint8_t* dst_u, - uint8_t* dst_v, - int width); -void UYVYToUV422Row_NEON(const uint8_t* src_uyvy, - uint8_t* dst_u, - uint8_t* dst_v, - int width); -void UYVYToYRow_MSA(const uint8_t* src_uyvy, uint8_t* dst_y, int width); -void UYVYToYRow_MMI(const uint8_t* src_uyvy, uint8_t* dst_y, int width); -void UYVYToUVRow_MSA(const uint8_t* src_uyvy, - int src_stride_uyvy, - uint8_t* dst_u, - uint8_t* dst_v, - int width); -void UYVYToUVRow_MMI(const uint8_t* src_uyvy, - int src_stride_uyvy, - uint8_t* dst_u, - uint8_t* dst_v, - int width); -void UYVYToUV422Row_MSA(const uint8_t* src_uyvy, - uint8_t* dst_u, - uint8_t* dst_v, - int width); -void UYVYToUV422Row_MMI(const uint8_t* src_uyvy, - uint8_t* dst_u, - uint8_t* dst_v, - int width); - -void UYVYToYRow_C(const uint8_t* src_uyvy, uint8_t* dst_y, int width); -void UYVYToUVRow_C(const uint8_t* src_uyvy, - int src_stride_uyvy, - uint8_t* dst_u, - uint8_t* dst_v, - int width); -void UYVYToUV422Row_C(const uint8_t* src_uyvy, - uint8_t* dst_u, - uint8_t* dst_v, - int width); -void UYVYToYRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); -void UYVYToUVRow_Any_AVX2(const uint8_t* src_ptr, - int src_stride, - uint8_t* dst_u, - uint8_t* dst_v, - int width); -void UYVYToUV422Row_Any_AVX2(const uint8_t* src_ptr, - uint8_t* dst_u, - uint8_t* dst_v, - int width); -void UYVYToYRow_Any_SSE2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); -void UYVYToUVRow_Any_SSE2(const uint8_t* src_ptr, - int src_stride, - uint8_t* dst_u, - uint8_t* dst_v, - int width); -void UYVYToUV422Row_Any_SSE2(const uint8_t* src_ptr, - uint8_t* dst_u, - uint8_t* dst_v, - int width); -void UYVYToYRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); -void UYVYToUVRow_Any_NEON(const uint8_t* src_ptr, - int src_stride, - uint8_t* dst_u, - uint8_t* dst_v, - int width); -void UYVYToUV422Row_Any_NEON(const uint8_t* src_ptr, - uint8_t* dst_u, - uint8_t* dst_v, - int width); -void UYVYToYRow_Any_MSA(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); -void UYVYToYRow_Any_MMI(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); -void UYVYToUVRow_Any_MSA(const uint8_t* src_ptr, - int src_stride_ptr, - uint8_t* dst_u, - uint8_t* dst_v, - int width); -void UYVYToUVRow_Any_MMI(const uint8_t* src_ptr, - int src_stride_ptr, - uint8_t* dst_u, - uint8_t* dst_v, - int width); -void UYVYToUV422Row_Any_MSA(const uint8_t* src_ptr, - uint8_t* dst_u, - uint8_t* dst_v, - int 
width); -void UYVYToUV422Row_Any_MMI(const uint8_t* src_ptr, - uint8_t* dst_u, - uint8_t* dst_v, - int width); -void SwapUVRow_C(const uint8_t* src_uv, uint8_t* dst_vu, int width); -void SwapUVRow_NEON(const uint8_t* src_uv, uint8_t* dst_vu, int width); -void SwapUVRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); -void SwapUVRow_SSSE3(const uint8_t* src_uv, uint8_t* dst_vu, int width); -void SwapUVRow_Any_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); -void SwapUVRow_AVX2(const uint8_t* src_uv, uint8_t* dst_vu, int width); -void SwapUVRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); -void AYUVToYRow_C(const uint8_t* src_ayuv, uint8_t* dst_y, int width); -void AYUVToUVRow_C(const uint8_t* src_ayuv, - int src_stride_ayuv, - uint8_t* dst_uv, - int width); -void AYUVToVURow_C(const uint8_t* src_ayuv, - int src_stride_ayuv, - uint8_t* dst_vu, - int width); -void AYUVToYRow_NEON(const uint8_t* src_ayuv, uint8_t* dst_y, int width); -void AYUVToUVRow_NEON(const uint8_t* src_ayuv, - int src_stride_ayuv, - uint8_t* dst_uv, - int width); -void AYUVToVURow_NEON(const uint8_t* src_ayuv, - int src_stride_ayuv, - uint8_t* dst_vu, - int width); -void AYUVToYRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); -void AYUVToUVRow_Any_NEON(const uint8_t* src_ptr, - int src_stride, - uint8_t* dst_vu, - int width); -void AYUVToVURow_Any_NEON(const uint8_t* src_ptr, - int src_stride, - uint8_t* dst_vu, - int width); - -void I422ToYUY2Row_C(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* dst_frame, - int width); -void I422ToUYVYRow_C(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* dst_frame, - int width); -void I422ToYUY2Row_SSE2(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* dst_yuy2, - int width); -void I422ToUYVYRow_SSE2(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* dst_uyvy, - int width); -void I422ToYUY2Row_Any_SSE2(const uint8_t* y_buf, - const uint8_t* u_buf, - const uint8_t* v_buf, - uint8_t* dst_ptr, - int width); -void I422ToUYVYRow_Any_SSE2(const uint8_t* y_buf, - const uint8_t* u_buf, - const uint8_t* v_buf, - uint8_t* dst_ptr, - int width); -void I422ToYUY2Row_AVX2(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* dst_yuy2, - int width); -void I422ToUYVYRow_AVX2(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* dst_uyvy, - int width); -void I422ToYUY2Row_Any_AVX2(const uint8_t* y_buf, - const uint8_t* u_buf, - const uint8_t* v_buf, - uint8_t* dst_ptr, - int width); -void I422ToUYVYRow_Any_AVX2(const uint8_t* y_buf, - const uint8_t* u_buf, - const uint8_t* v_buf, - uint8_t* dst_ptr, - int width); -void I422ToYUY2Row_NEON(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* dst_yuy2, - int width); -void I422ToUYVYRow_NEON(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* dst_uyvy, - int width); -void I422ToYUY2Row_Any_NEON(const uint8_t* y_buf, - const uint8_t* u_buf, - const uint8_t* v_buf, - uint8_t* dst_ptr, - int width); -void I422ToUYVYRow_Any_NEON(const uint8_t* y_buf, - const uint8_t* u_buf, - const uint8_t* v_buf, - uint8_t* dst_ptr, - int width); -void I422ToYUY2Row_MSA(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* dst_yuy2, - int width); -void I422ToYUY2Row_MMI(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* 
dst_yuy2, - int width); -void I422ToUYVYRow_MSA(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* dst_uyvy, - int width); -void I422ToUYVYRow_MMI(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* dst_uyvy, - int width); -void I422ToYUY2Row_Any_MSA(const uint8_t* y_buf, - const uint8_t* u_buf, - const uint8_t* v_buf, - uint8_t* dst_ptr, - int width); -void I422ToYUY2Row_Any_MMI(const uint8_t* y_buf, - const uint8_t* u_buf, - const uint8_t* v_buf, - uint8_t* dst_ptr, - int width); -void I422ToUYVYRow_Any_MSA(const uint8_t* y_buf, - const uint8_t* u_buf, - const uint8_t* v_buf, - uint8_t* dst_ptr, - int width); -void I422ToUYVYRow_Any_MMI(const uint8_t* y_buf, - const uint8_t* u_buf, - const uint8_t* v_buf, - uint8_t* dst_ptr, - int width); - -// Effects related row functions. -void ARGBAttenuateRow_C(const uint8_t* src_argb, uint8_t* dst_argb, int width); -void ARGBAttenuateRow_SSSE3(const uint8_t* src_argb, - uint8_t* dst_argb, - int width); -void ARGBAttenuateRow_AVX2(const uint8_t* src_argb, - uint8_t* dst_argb, - int width); -void ARGBAttenuateRow_NEON(const uint8_t* src_argb, - uint8_t* dst_argb, - int width); -void ARGBAttenuateRow_MSA(const uint8_t* src_argb, - uint8_t* dst_argb, - int width); -void ARGBAttenuateRow_MMI(const uint8_t* src_argb, - uint8_t* dst_argb, - int width); -void ARGBAttenuateRow_Any_SSSE3(const uint8_t* src_ptr, - uint8_t* dst_ptr, - int width); -void ARGBAttenuateRow_Any_AVX2(const uint8_t* src_ptr, - uint8_t* dst_ptr, - int width); -void ARGBAttenuateRow_Any_NEON(const uint8_t* src_ptr, - uint8_t* dst_ptr, - int width); -void ARGBAttenuateRow_Any_MSA(const uint8_t* src_ptr, - uint8_t* dst_ptr, - int width); -void ARGBAttenuateRow_Any_MMI(const uint8_t* src_ptr, - uint8_t* dst_ptr, - int width); - -// Inverse table for unattenuate, shared by C and SSE2. 
-extern const uint32_t fixed_invtbl8[256]; -void ARGBUnattenuateRow_C(const uint8_t* src_argb, - uint8_t* dst_argb, - int width); -void ARGBUnattenuateRow_SSE2(const uint8_t* src_argb, - uint8_t* dst_argb, - int width); -void ARGBUnattenuateRow_AVX2(const uint8_t* src_argb, - uint8_t* dst_argb, - int width); -void ARGBUnattenuateRow_Any_SSE2(const uint8_t* src_ptr, - uint8_t* dst_ptr, - int width); -void ARGBUnattenuateRow_Any_AVX2(const uint8_t* src_ptr, - uint8_t* dst_ptr, - int width); - -void ARGBGrayRow_C(const uint8_t* src_argb, uint8_t* dst_argb, int width); -void ARGBGrayRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_argb, int width); -void ARGBGrayRow_NEON(const uint8_t* src_argb, uint8_t* dst_argb, int width); -void ARGBGrayRow_MSA(const uint8_t* src_argb, uint8_t* dst_argb, int width); -void ARGBGrayRow_MMI(const uint8_t* src_argb, uint8_t* dst_argb, int width); - -void ARGBSepiaRow_C(uint8_t* dst_argb, int width); -void ARGBSepiaRow_SSSE3(uint8_t* dst_argb, int width); -void ARGBSepiaRow_NEON(uint8_t* dst_argb, int width); -void ARGBSepiaRow_MSA(uint8_t* dst_argb, int width); -void ARGBSepiaRow_MMI(uint8_t* dst_argb, int width); - -void ARGBColorMatrixRow_C(const uint8_t* src_argb, - uint8_t* dst_argb, - const int8_t* matrix_argb, - int width); -void ARGBColorMatrixRow_SSSE3(const uint8_t* src_argb, - uint8_t* dst_argb, - const int8_t* matrix_argb, - int width); -void ARGBColorMatrixRow_NEON(const uint8_t* src_argb, - uint8_t* dst_argb, - const int8_t* matrix_argb, - int width); -void ARGBColorMatrixRow_MSA(const uint8_t* src_argb, - uint8_t* dst_argb, - const int8_t* matrix_argb, - int width); -void ARGBColorMatrixRow_MMI(const uint8_t* src_argb, - uint8_t* dst_argb, - const int8_t* matrix_argb, - int width); - -void ARGBColorTableRow_C(uint8_t* dst_argb, - const uint8_t* table_argb, - int width); -void ARGBColorTableRow_X86(uint8_t* dst_argb, - const uint8_t* table_argb, - int width); - -void RGBColorTableRow_C(uint8_t* dst_argb, - const uint8_t* table_argb, - int width); -void RGBColorTableRow_X86(uint8_t* dst_argb, - const uint8_t* table_argb, - int width); - -void ARGBQuantizeRow_C(uint8_t* dst_argb, - int scale, - int interval_size, - int interval_offset, - int width); -void ARGBQuantizeRow_SSE2(uint8_t* dst_argb, - int scale, - int interval_size, - int interval_offset, - int width); -void ARGBQuantizeRow_NEON(uint8_t* dst_argb, - int scale, - int interval_size, - int interval_offset, - int width); -void ARGBQuantizeRow_MSA(uint8_t* dst_argb, - int scale, - int interval_size, - int interval_offset, - int width); - -void ARGBShadeRow_C(const uint8_t* src_argb, - uint8_t* dst_argb, - int width, - uint32_t value); -void ARGBShadeRow_SSE2(const uint8_t* src_argb, - uint8_t* dst_argb, - int width, - uint32_t value); -void ARGBShadeRow_NEON(const uint8_t* src_argb, - uint8_t* dst_argb, - int width, - uint32_t value); -void ARGBShadeRow_MSA(const uint8_t* src_argb, - uint8_t* dst_argb, - int width, - uint32_t value); -void ARGBShadeRow_MMI(const uint8_t* src_argb, - uint8_t* dst_argb, - int width, - uint32_t value); - -// Used for blur. 
-void CumulativeSumToAverageRow_SSE2(const int32_t* topleft, - const int32_t* botleft, - int width, - int area, - uint8_t* dst, - int count); -void ComputeCumulativeSumRow_SSE2(const uint8_t* row, - int32_t* cumsum, - const int32_t* previous_cumsum, - int width); - -void ComputeCumulativeSumRow_MMI(const uint8_t* row, - int32_t* cumsum, - const int32_t* previous_cumsum, - int width); - -void CumulativeSumToAverageRow_C(const int32_t* tl, - const int32_t* bl, - int w, - int area, - uint8_t* dst, - int count); -void ComputeCumulativeSumRow_C(const uint8_t* row, - int32_t* cumsum, - const int32_t* previous_cumsum, - int width); - -LIBYUV_API -void ARGBAffineRow_C(const uint8_t* src_argb, - int src_argb_stride, - uint8_t* dst_argb, - const float* uv_dudv, - int width); -LIBYUV_API -void ARGBAffineRow_SSE2(const uint8_t* src_argb, - int src_argb_stride, - uint8_t* dst_argb, - const float* src_dudv, - int width); - -// Used for I420Scale, ARGBScale, and ARGBInterpolate. -void InterpolateRow_C(uint8_t* dst_ptr, - const uint8_t* src_ptr, - ptrdiff_t src_stride, - int width, - int source_y_fraction); -void InterpolateRow_SSSE3(uint8_t* dst_ptr, - const uint8_t* src_ptr, - ptrdiff_t src_stride, - int dst_width, - int source_y_fraction); -void InterpolateRow_AVX2(uint8_t* dst_ptr, - const uint8_t* src_ptr, - ptrdiff_t src_stride, - int dst_width, - int source_y_fraction); -void InterpolateRow_NEON(uint8_t* dst_ptr, - const uint8_t* src_ptr, - ptrdiff_t src_stride, - int dst_width, - int source_y_fraction); -void InterpolateRow_MSA(uint8_t* dst_ptr, - const uint8_t* src_ptr, - ptrdiff_t src_stride, - int width, - int source_y_fraction); -void InterpolateRow_MMI(uint8_t* dst_ptr, - const uint8_t* src_ptr, - ptrdiff_t src_stride, - int width, - int source_y_fraction); -void InterpolateRow_Any_NEON(uint8_t* dst_ptr, - const uint8_t* src_ptr, - ptrdiff_t src_stride_ptr, - int width, - int source_y_fraction); -void InterpolateRow_Any_SSSE3(uint8_t* dst_ptr, - const uint8_t* src_ptr, - ptrdiff_t src_stride_ptr, - int width, - int source_y_fraction); -void InterpolateRow_Any_AVX2(uint8_t* dst_ptr, - const uint8_t* src_ptr, - ptrdiff_t src_stride_ptr, - int width, - int source_y_fraction); -void InterpolateRow_Any_MSA(uint8_t* dst_ptr, - const uint8_t* src_ptr, - ptrdiff_t src_stride_ptr, - int width, - int source_y_fraction); -void InterpolateRow_Any_MMI(uint8_t* dst_ptr, - const uint8_t* src_ptr, - ptrdiff_t src_stride_ptr, - int width, - int source_y_fraction); - -void InterpolateRow_16_C(uint16_t* dst_ptr, - const uint16_t* src_ptr, - ptrdiff_t src_stride, - int width, - int source_y_fraction); - -// Sobel images. 
-void SobelXRow_C(const uint8_t* src_y0, - const uint8_t* src_y1, - const uint8_t* src_y2, - uint8_t* dst_sobelx, - int width); -void SobelXRow_SSE2(const uint8_t* src_y0, - const uint8_t* src_y1, - const uint8_t* src_y2, - uint8_t* dst_sobelx, - int width); -void SobelXRow_NEON(const uint8_t* src_y0, - const uint8_t* src_y1, - const uint8_t* src_y2, - uint8_t* dst_sobelx, - int width); -void SobelXRow_MSA(const uint8_t* src_y0, - const uint8_t* src_y1, - const uint8_t* src_y2, - uint8_t* dst_sobelx, - int width); -void SobelXRow_MMI(const uint8_t* src_y0, - const uint8_t* src_y1, - const uint8_t* src_y2, - uint8_t* dst_sobelx, - int width); -void SobelYRow_C(const uint8_t* src_y0, - const uint8_t* src_y1, - uint8_t* dst_sobely, - int width); -void SobelYRow_SSE2(const uint8_t* src_y0, - const uint8_t* src_y1, - uint8_t* dst_sobely, - int width); -void SobelYRow_NEON(const uint8_t* src_y0, - const uint8_t* src_y1, - uint8_t* dst_sobely, - int width); -void SobelYRow_MSA(const uint8_t* src_y0, - const uint8_t* src_y1, - uint8_t* dst_sobely, - int width); -void SobelYRow_MMI(const uint8_t* src_y0, - const uint8_t* src_y1, - uint8_t* dst_sobely, - int width); -void SobelRow_C(const uint8_t* src_sobelx, - const uint8_t* src_sobely, - uint8_t* dst_argb, - int width); -void SobelRow_SSE2(const uint8_t* src_sobelx, - const uint8_t* src_sobely, - uint8_t* dst_argb, - int width); -void SobelRow_NEON(const uint8_t* src_sobelx, - const uint8_t* src_sobely, - uint8_t* dst_argb, - int width); -void SobelRow_MSA(const uint8_t* src_sobelx, - const uint8_t* src_sobely, - uint8_t* dst_argb, - int width); -void SobelRow_MMI(const uint8_t* src_sobelx, - const uint8_t* src_sobely, - uint8_t* dst_argb, - int width); -void SobelToPlaneRow_C(const uint8_t* src_sobelx, - const uint8_t* src_sobely, - uint8_t* dst_y, - int width); -void SobelToPlaneRow_SSE2(const uint8_t* src_sobelx, - const uint8_t* src_sobely, - uint8_t* dst_y, - int width); -void SobelToPlaneRow_NEON(const uint8_t* src_sobelx, - const uint8_t* src_sobely, - uint8_t* dst_y, - int width); -void SobelToPlaneRow_MSA(const uint8_t* src_sobelx, - const uint8_t* src_sobely, - uint8_t* dst_y, - int width); -void SobelToPlaneRow_MMI(const uint8_t* src_sobelx, - const uint8_t* src_sobely, - uint8_t* dst_y, - int width); -void SobelXYRow_C(const uint8_t* src_sobelx, - const uint8_t* src_sobely, - uint8_t* dst_argb, - int width); -void SobelXYRow_SSE2(const uint8_t* src_sobelx, - const uint8_t* src_sobely, - uint8_t* dst_argb, - int width); -void SobelXYRow_NEON(const uint8_t* src_sobelx, - const uint8_t* src_sobely, - uint8_t* dst_argb, - int width); -void SobelXYRow_MSA(const uint8_t* src_sobelx, - const uint8_t* src_sobely, - uint8_t* dst_argb, - int width); -void SobelXYRow_MMI(const uint8_t* src_sobelx, - const uint8_t* src_sobely, - uint8_t* dst_argb, - int width); -void SobelRow_Any_SSE2(const uint8_t* y_buf, - const uint8_t* uv_buf, - uint8_t* dst_ptr, - int width); -void SobelRow_Any_NEON(const uint8_t* y_buf, - const uint8_t* uv_buf, - uint8_t* dst_ptr, - int width); -void SobelRow_Any_MSA(const uint8_t* y_buf, - const uint8_t* uv_buf, - uint8_t* dst_ptr, - int width); -void SobelRow_Any_MMI(const uint8_t* y_buf, - const uint8_t* uv_buf, - uint8_t* dst_ptr, - int width); -void SobelToPlaneRow_Any_SSE2(const uint8_t* y_buf, - const uint8_t* uv_buf, - uint8_t* dst_ptr, - int width); -void SobelToPlaneRow_Any_NEON(const uint8_t* y_buf, - const uint8_t* uv_buf, - uint8_t* dst_ptr, - int width); -void SobelToPlaneRow_Any_MSA(const uint8_t* y_buf, - 
const uint8_t* uv_buf, - uint8_t* dst_ptr, - int width); -void SobelToPlaneRow_Any_MMI(const uint8_t* y_buf, - const uint8_t* uv_buf, - uint8_t* dst_ptr, - int width); -void SobelXYRow_Any_SSE2(const uint8_t* y_buf, - const uint8_t* uv_buf, - uint8_t* dst_ptr, - int width); -void SobelXYRow_Any_NEON(const uint8_t* y_buf, - const uint8_t* uv_buf, - uint8_t* dst_ptr, - int width); -void SobelXYRow_Any_MSA(const uint8_t* y_buf, - const uint8_t* uv_buf, - uint8_t* dst_ptr, - int width); -void SobelXYRow_Any_MMI(const uint8_t* y_buf, - const uint8_t* uv_buf, - uint8_t* dst_ptr, - int width); - -void ARGBPolynomialRow_C(const uint8_t* src_argb, - uint8_t* dst_argb, - const float* poly, - int width); -void ARGBPolynomialRow_SSE2(const uint8_t* src_argb, - uint8_t* dst_argb, - const float* poly, - int width); -void ARGBPolynomialRow_AVX2(const uint8_t* src_argb, - uint8_t* dst_argb, - const float* poly, - int width); - -// Scale and convert to half float. -void HalfFloatRow_C(const uint16_t* src, uint16_t* dst, float scale, int width); -void HalfFloatRow_SSE2(const uint16_t* src, - uint16_t* dst, - float scale, - int width); -void HalfFloatRow_Any_SSE2(const uint16_t* src_ptr, - uint16_t* dst_ptr, - float param, - int width); -void HalfFloatRow_AVX2(const uint16_t* src, - uint16_t* dst, - float scale, - int width); -void HalfFloatRow_Any_AVX2(const uint16_t* src_ptr, - uint16_t* dst_ptr, - float param, - int width); -void HalfFloatRow_F16C(const uint16_t* src, - uint16_t* dst, - float scale, - int width); -void HalfFloatRow_Any_F16C(const uint16_t* src, - uint16_t* dst, - float scale, - int width); -void HalfFloat1Row_F16C(const uint16_t* src, - uint16_t* dst, - float scale, - int width); -void HalfFloat1Row_Any_F16C(const uint16_t* src, - uint16_t* dst, - float scale, - int width); -void HalfFloatRow_NEON(const uint16_t* src, - uint16_t* dst, - float scale, - int width); -void HalfFloatRow_Any_NEON(const uint16_t* src_ptr, - uint16_t* dst_ptr, - float param, - int width); -void HalfFloat1Row_NEON(const uint16_t* src, - uint16_t* dst, - float scale, - int width); -void HalfFloat1Row_Any_NEON(const uint16_t* src_ptr, - uint16_t* dst_ptr, - float param, - int width); -void HalfFloatRow_MSA(const uint16_t* src, - uint16_t* dst, - float scale, - int width); -void HalfFloatRow_Any_MSA(const uint16_t* src_ptr, - uint16_t* dst_ptr, - float param, - int width); -void ByteToFloatRow_C(const uint8_t* src, float* dst, float scale, int width); -void ByteToFloatRow_NEON(const uint8_t* src, - float* dst, - float scale, - int width); -void ByteToFloatRow_Any_NEON(const uint8_t* src_ptr, - float* dst_ptr, - float param, - int width); - -void ARGBLumaColorTableRow_C(const uint8_t* src_argb, - uint8_t* dst_argb, - int width, - const uint8_t* luma, - uint32_t lumacoeff); -void ARGBLumaColorTableRow_SSSE3(const uint8_t* src_argb, - uint8_t* dst_argb, - int width, - const uint8_t* luma, - uint32_t lumacoeff); - -float ScaleMaxSamples_C(const float* src, float* dst, float scale, int width); -float ScaleMaxSamples_NEON(const float* src, - float* dst, - float scale, - int width); -float ScaleSumSamples_C(const float* src, float* dst, float scale, int width); -float ScaleSumSamples_NEON(const float* src, - float* dst, - float scale, - int width); -void ScaleSamples_C(const float* src, float* dst, float scale, int width); -void ScaleSamples_NEON(const float* src, float* dst, float scale, int width); - -void I210ToARGBRow_MMI(const uint16_t* src_y, - const uint16_t* src_u, - const uint16_t* src_v, - uint8_t* rgb_buf, - 
const struct YuvConstants* yuvconstants, - int width); -void I422ToRGBARow_MMI(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width); -void I422AlphaToARGBRow_MMI(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - const uint8_t* src_a, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width); -void I422ToRGB24Row_MMI(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width); -void I422ToRGB565Row_MMI(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* dst_rgb565, - const struct YuvConstants* yuvconstants, - int width); -void I422ToARGB4444Row_MMI(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* dst_argb4444, - const struct YuvConstants* yuvconstants, - int width); -void I422ToARGB1555Row_MMI(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* dst_argb1555, - const struct YuvConstants* yuvconstants, - int width); -void NV12ToARGBRow_MMI(const uint8_t* src_y, - const uint8_t* src_uv, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width); -void NV12ToRGB565Row_MMI(const uint8_t* src_y, - const uint8_t* src_uv, - uint8_t* dst_rgb565, - const struct YuvConstants* yuvconstants, - int width); -void NV21ToARGBRow_MMI(const uint8_t* src_y, - const uint8_t* src_vu, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width); -void NV12ToRGB24Row_MMI(const uint8_t* src_y, - const uint8_t* src_uv, - uint8_t* dst_rgb24, - const struct YuvConstants* yuvconstants, - int width); -void NV21ToRGB24Row_MMI(const uint8_t* src_y, - const uint8_t* src_vu, - uint8_t* dst_rgb24, - const struct YuvConstants* yuvconstants, - int width); -void YUY2ToARGBRow_MMI(const uint8_t* src_yuy2, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width); -void UYVYToARGBRow_MMI(const uint8_t* src_uyvy, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width); -void I210ToARGBRow_Any_MMI(const uint16_t* y_buf, - const uint16_t* u_buf, - const uint16_t* v_buf, - uint8_t* dst_ptr, - const struct YuvConstants* yuvconstants, - int width); -void I422ToRGBARow_Any_MMI(const uint8_t* y_buf, - const uint8_t* u_buf, - const uint8_t* v_buf, - uint8_t* dst_ptr, - const struct YuvConstants* yuvconstants, - int width); -void I422AlphaToARGBRow_Any_MMI(const uint8_t* y_buf, - const uint8_t* u_buf, - const uint8_t* v_buf, - const uint8_t* a_buf, - uint8_t* dst_ptr, - const struct YuvConstants* yuvconstants, - int width); -void I422ToRGB24Row_Any_MMI(const uint8_t* y_buf, - const uint8_t* u_buf, - const uint8_t* v_buf, - uint8_t* dst_ptr, - const struct YuvConstants* yuvconstants, - int width); -void I422ToRGB565Row_Any_MMI(const uint8_t* y_buf, - const uint8_t* u_buf, - const uint8_t* v_buf, - uint8_t* dst_ptr, - const struct YuvConstants* yuvconstants, - int width); -void I422ToARGB4444Row_Any_MMI(const uint8_t* y_buf, - const uint8_t* u_buf, - const uint8_t* v_buf, - uint8_t* dst_ptr, - const struct YuvConstants* yuvconstants, - int width); -void I422ToARGB1555Row_Any_MMI(const uint8_t* y_buf, - const uint8_t* u_buf, - const uint8_t* v_buf, - uint8_t* dst_ptr, - const struct YuvConstants* yuvconstants, - int width); -void NV12ToARGBRow_Any_MMI(const uint8_t* y_buf, - const uint8_t* uv_buf, - uint8_t* dst_ptr, - const struct YuvConstants* yuvconstants, - 
int width); -void NV12ToRGB565Row_Any_MMI(const uint8_t* y_buf, - const uint8_t* uv_buf, - uint8_t* dst_ptr, - const struct YuvConstants* yuvconstants, - int width); -void NV21ToARGBRow_Any_MMI(const uint8_t* y_buf, - const uint8_t* uv_buf, - uint8_t* dst_ptr, - const struct YuvConstants* yuvconstants, - int width); -void NV12ToRGB24Row_Any_MMI(const uint8_t* y_buf, - const uint8_t* uv_buf, - uint8_t* dst_ptr, - const struct YuvConstants* yuvconstants, - int width); -void NV21ToRGB24Row_Any_MMI(const uint8_t* y_buf, - const uint8_t* uv_buf, - uint8_t* dst_ptr, - const struct YuvConstants* yuvconstants, - int width); -void YUY2ToARGBRow_Any_MMI(const uint8_t* src_ptr, - uint8_t* dst_ptr, - const struct YuvConstants* yuvconstants, - int width); -void UYVYToARGBRow_Any_MMI(const uint8_t* src_ptr, - uint8_t* dst_ptr, - const struct YuvConstants* yuvconstants, - int width); - -void GaussRow_F32_NEON(const float* src, float* dst, int width); -void GaussRow_F32_C(const float* src, float* dst, int width); - -void GaussCol_F32_NEON(const float* src0, - const float* src1, - const float* src2, - const float* src3, - const float* src4, - float* dst, - int width); - -void GaussCol_F32_C(const float* src0, - const float* src1, - const float* src2, - const float* src3, - const float* src4, - float* dst, - int width); - -#ifdef __cplusplus -} // extern "C" -} // namespace libyuv -#endif - -#endif // INCLUDE_LIBYUV_ROW_H_ diff --git a/thirdparty/libyuv/include/libyuv/scale.h b/thirdparty/libyuv/include/libyuv/scale.h deleted file mode 100644 index 3d4b600..0000000 --- a/thirdparty/libyuv/include/libyuv/scale.h +++ /dev/null @@ -1,254 +0,0 @@ -/* - * Copyright 2011 The LibYuv Project Authors. All rights reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#ifndef INCLUDE_LIBYUV_SCALE_H_ -#define INCLUDE_LIBYUV_SCALE_H_ - -#include "libyuv/basic_types.h" - -#ifdef __cplusplus -namespace libyuv { -extern "C" { -#endif - -// Supported filtering. -typedef enum FilterMode { - kFilterNone = 0, // Point sample; Fastest. - kFilterLinear = 1, // Filter horizontally only. - kFilterBilinear = 2, // Faster than box, but lower quality scaling down. - kFilterBox = 3 // Highest quality. -} FilterModeEnum; - -// Scale a YUV plane. -LIBYUV_API -void ScalePlane(const uint8_t* src, - int src_stride, - int src_width, - int src_height, - uint8_t* dst, - int dst_stride, - int dst_width, - int dst_height, - enum FilterMode filtering); - -LIBYUV_API -void ScalePlane_16(const uint16_t* src, - int src_stride, - int src_width, - int src_height, - uint16_t* dst, - int dst_stride, - int dst_width, - int dst_height, - enum FilterMode filtering); - -// Sample is expected to be in the low 12 bits. -LIBYUV_API -void ScalePlane_12(const uint16_t* src, - int src_stride, - int src_width, - int src_height, - uint16_t* dst, - int dst_stride, - int dst_width, - int dst_height, - enum FilterMode filtering); - -// Scales a YUV 4:2:0 image from the src width and height to the -// dst width and height. -// If filtering is kFilterNone, a simple nearest-neighbor algorithm is -// used. This produces basic (blocky) quality at the fastest speed. 
-// If filtering is kFilterBilinear, interpolation is used to produce a better -// quality image, at the expense of speed. -// If filtering is kFilterBox, averaging is used to produce ever better -// quality image, at further expense of speed. -// Returns 0 if successful. - -LIBYUV_API -int I420Scale(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - int src_width, - int src_height, - uint8_t* dst_y, - int dst_stride_y, - uint8_t* dst_u, - int dst_stride_u, - uint8_t* dst_v, - int dst_stride_v, - int dst_width, - int dst_height, - enum FilterMode filtering); - -LIBYUV_API -int I420Scale_16(const uint16_t* src_y, - int src_stride_y, - const uint16_t* src_u, - int src_stride_u, - const uint16_t* src_v, - int src_stride_v, - int src_width, - int src_height, - uint16_t* dst_y, - int dst_stride_y, - uint16_t* dst_u, - int dst_stride_u, - uint16_t* dst_v, - int dst_stride_v, - int dst_width, - int dst_height, - enum FilterMode filtering); - -LIBYUV_API -int I420Scale_12(const uint16_t* src_y, - int src_stride_y, - const uint16_t* src_u, - int src_stride_u, - const uint16_t* src_v, - int src_stride_v, - int src_width, - int src_height, - uint16_t* dst_y, - int dst_stride_y, - uint16_t* dst_u, - int dst_stride_u, - uint16_t* dst_v, - int dst_stride_v, - int dst_width, - int dst_height, - enum FilterMode filtering); - -// Scales a YUV 4:4:4 image from the src width and height to the -// dst width and height. -// If filtering is kFilterNone, a simple nearest-neighbor algorithm is -// used. This produces basic (blocky) quality at the fastest speed. -// If filtering is kFilterBilinear, interpolation is used to produce a better -// quality image, at the expense of speed. -// If filtering is kFilterBox, averaging is used to produce ever better -// quality image, at further expense of speed. -// Returns 0 if successful. - -LIBYUV_API -int I444Scale(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - int src_width, - int src_height, - uint8_t* dst_y, - int dst_stride_y, - uint8_t* dst_u, - int dst_stride_u, - uint8_t* dst_v, - int dst_stride_v, - int dst_width, - int dst_height, - enum FilterMode filtering); - -LIBYUV_API -int I444Scale_16(const uint16_t* src_y, - int src_stride_y, - const uint16_t* src_u, - int src_stride_u, - const uint16_t* src_v, - int src_stride_v, - int src_width, - int src_height, - uint16_t* dst_y, - int dst_stride_y, - uint16_t* dst_u, - int dst_stride_u, - uint16_t* dst_v, - int dst_stride_v, - int dst_width, - int dst_height, - enum FilterMode filtering); - -LIBYUV_API -int I444Scale_12(const uint16_t* src_y, - int src_stride_y, - const uint16_t* src_u, - int src_stride_u, - const uint16_t* src_v, - int src_stride_v, - int src_width, - int src_height, - uint16_t* dst_y, - int dst_stride_y, - uint16_t* dst_u, - int dst_stride_u, - uint16_t* dst_v, - int dst_stride_v, - int dst_width, - int dst_height, - enum FilterMode filtering); - -// Scales an NV12 image from the src width and height to the -// dst width and height. -// If filtering is kFilterNone, a simple nearest-neighbor algorithm is -// used. This produces basic (blocky) quality at the fastest speed. -// If filtering is kFilterBilinear, interpolation is used to produce a better -// quality image, at the expense of speed. -// kFilterBox is not supported for the UV channel and will be treated as -// bilinear. -// Returns 0 if successful. 
- -LIBYUV_API -int NV12Scale(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_uv, - int src_stride_uv, - int src_width, - int src_height, - uint8_t* dst_y, - int dst_stride_y, - uint8_t* dst_uv, - int dst_stride_uv, - int dst_width, - int dst_height, - enum FilterMode filtering); - -#ifdef __cplusplus -// Legacy API. Deprecated. -LIBYUV_API -int Scale(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - int src_stride_y, - int src_stride_u, - int src_stride_v, - int src_width, - int src_height, - uint8_t* dst_y, - uint8_t* dst_u, - uint8_t* dst_v, - int dst_stride_y, - int dst_stride_u, - int dst_stride_v, - int dst_width, - int dst_height, - LIBYUV_BOOL interpolate); - -// For testing, allow disabling of specialized scalers. -LIBYUV_API -void SetUseReferenceImpl(LIBYUV_BOOL use); -#endif // __cplusplus - -#ifdef __cplusplus -} // extern "C" -} // namespace libyuv -#endif - -#endif // INCLUDE_LIBYUV_SCALE_H_ diff --git a/thirdparty/libyuv/include/libyuv/scale_argb.h b/thirdparty/libyuv/include/libyuv/scale_argb.h deleted file mode 100644 index 7641f18..0000000 --- a/thirdparty/libyuv/include/libyuv/scale_argb.h +++ /dev/null @@ -1,76 +0,0 @@ -/* - * Copyright 2012 The LibYuv Project Authors. All rights reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#ifndef INCLUDE_LIBYUV_SCALE_ARGB_H_ -#define INCLUDE_LIBYUV_SCALE_ARGB_H_ - -#include "libyuv/basic_types.h" -#include "libyuv/scale.h" // For FilterMode - -#ifdef __cplusplus -namespace libyuv { -extern "C" { -#endif - -LIBYUV_API -int ARGBScale(const uint8_t* src_argb, - int src_stride_argb, - int src_width, - int src_height, - uint8_t* dst_argb, - int dst_stride_argb, - int dst_width, - int dst_height, - enum FilterMode filtering); - -// Clipped scale takes destination rectangle coordinates for clip values. -LIBYUV_API -int ARGBScaleClip(const uint8_t* src_argb, - int src_stride_argb, - int src_width, - int src_height, - uint8_t* dst_argb, - int dst_stride_argb, - int dst_width, - int dst_height, - int clip_x, - int clip_y, - int clip_width, - int clip_height, - enum FilterMode filtering); - -// Scale with YUV conversion to ARGB and clipping. -LIBYUV_API -int YUVToARGBScaleClip(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint32_t src_fourcc, - int src_width, - int src_height, - uint8_t* dst_argb, - int dst_stride_argb, - uint32_t dst_fourcc, - int dst_width, - int dst_height, - int clip_x, - int clip_y, - int clip_width, - int clip_height, - enum FilterMode filtering); - -#ifdef __cplusplus -} // extern "C" -} // namespace libyuv -#endif - -#endif // INCLUDE_LIBYUV_SCALE_ARGB_H_ diff --git a/thirdparty/libyuv/include/libyuv/scale_row.h b/thirdparty/libyuv/include/libyuv/scale_row.h deleted file mode 100644 index 833af1c..0000000 --- a/thirdparty/libyuv/include/libyuv/scale_row.h +++ /dev/null @@ -1,1727 +0,0 @@ -/* - * Copyright 2013 The LibYuv Project Authors. All rights reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. 
All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#ifndef INCLUDE_LIBYUV_SCALE_ROW_H_ -#define INCLUDE_LIBYUV_SCALE_ROW_H_ - -#include "libyuv/basic_types.h" -#include "libyuv/scale.h" - -#ifdef __cplusplus -namespace libyuv { -extern "C" { -#endif - -#if defined(__pnacl__) || defined(__CLR_VER) || \ - (defined(__native_client__) && defined(__x86_64__)) || \ - (defined(__i386__) && !defined(__SSE__) && !defined(__clang__)) -#define LIBYUV_DISABLE_X86 -#endif -#if defined(__native_client__) -#define LIBYUV_DISABLE_NEON -#endif -// MemorySanitizer does not support assembly code yet. http://crbug.com/344505 -#if defined(__has_feature) -#if __has_feature(memory_sanitizer) -#define LIBYUV_DISABLE_X86 -#endif -#endif -// GCC >= 4.7.0 required for AVX2. -#if defined(__GNUC__) && (defined(__x86_64__) || defined(__i386__)) -#if (__GNUC__ > 4) || (__GNUC__ == 4 && (__GNUC_MINOR__ >= 7)) -#define GCC_HAS_AVX2 1 -#endif // GNUC >= 4.7 -#endif // __GNUC__ - -// clang >= 3.4.0 required for AVX2. -#if defined(__clang__) && (defined(__x86_64__) || defined(__i386__)) -#if (__clang_major__ > 3) || (__clang_major__ == 3 && (__clang_minor__ >= 4)) -#define CLANG_HAS_AVX2 1 -#endif // clang >= 3.4 -#endif // __clang__ - -// Visual C 2012 required for AVX2. -#if defined(_M_IX86) && !defined(__clang__) && defined(_MSC_VER) && \ - _MSC_VER >= 1700 -#define VISUALC_HAS_AVX2 1 -#endif // VisualStudio >= 2012 - -// The following are available on all x86 platforms: -#if !defined(LIBYUV_DISABLE_X86) && \ - (defined(_M_IX86) || defined(__x86_64__) || defined(__i386__)) -#define HAS_FIXEDDIV1_X86 -#define HAS_FIXEDDIV_X86 -#define HAS_SCALEADDROW_SSE2 -#define HAS_SCALEARGBCOLS_SSE2 -#define HAS_SCALEARGBCOLSUP2_SSE2 -#define HAS_SCALEARGBFILTERCOLS_SSSE3 -#define HAS_SCALEARGBROWDOWN2_SSE2 -#define HAS_SCALEARGBROWDOWNEVEN_SSE2 -#define HAS_SCALECOLSUP2_SSE2 -#define HAS_SCALEFILTERCOLS_SSSE3 -#define HAS_SCALEROWDOWN2_SSSE3 -#define HAS_SCALEROWDOWN34_SSSE3 -#define HAS_SCALEROWDOWN38_SSSE3 -#define HAS_SCALEROWDOWN4_SSSE3 -#endif - -// The following are available for gcc/clang x86 platforms: -// TODO(fbarchard): Port to Visual C -#if !defined(LIBYUV_DISABLE_X86) && \ - (defined(__x86_64__) || defined(__i386__)) -#define HAS_SCALEUVROWDOWN2BOX_SSSE3 -#define HAS_SCALEROWUP2LINEAR_SSE2 -#define HAS_SCALEROWUP2LINEAR_SSSE3 -#define HAS_SCALEROWUP2BILINEAR_SSE2 -#define HAS_SCALEROWUP2BILINEAR_SSSE3 -#define HAS_SCALEROWUP2LINEAR_12_SSSE3 -#define HAS_SCALEROWUP2BILINEAR_12_SSSE3 -#define HAS_SCALEROWUP2LINEAR_16_SSE2 -#define HAS_SCALEROWUP2BILINEAR_16_SSE2 -#define HAS_SCALEUVROWUP2LINEAR_SSSE3 -#define HAS_SCALEUVROWUP2BILINEAR_SSSE3 -#define HAS_SCALEUVROWUP2LINEAR_16_SSE2 -#define HAS_SCALEUVROWUP2BILINEAR_16_SSE2 -#endif - -// The following are available for gcc/clang x86 platforms, but -// require clang 3.4 or gcc 4.7. 
-// TODO(fbarchard): Port to Visual C -#if !defined(LIBYUV_DISABLE_X86) && \ - (defined(__x86_64__) || defined(__i386__)) && \ - (defined(CLANG_HAS_AVX2) || defined(GCC_HAS_AVX2)) -#define HAS_SCALEUVROWDOWN2BOX_AVX2 -#define HAS_SCALEROWUP2LINEAR_AVX2 -#define HAS_SCALEROWUP2BILINEAR_AVX2 -#define HAS_SCALEROWUP2LINEAR_12_AVX2 -#define HAS_SCALEROWUP2BILINEAR_12_AVX2 -#define HAS_SCALEROWUP2LINEAR_16_AVX2 -#define HAS_SCALEROWUP2BILINEAR_16_AVX2 -#define HAS_SCALEUVROWUP2LINEAR_AVX2 -#define HAS_SCALEUVROWUP2BILINEAR_AVX2 -#define HAS_SCALEUVROWUP2LINEAR_16_AVX2 -#define HAS_SCALEUVROWUP2BILINEAR_16_AVX2 -#endif - -// The following are available on all x86 platforms, but -// require VS2012, clang 3.4 or gcc 4.7. -// The code supports NaCL but requires a new compiler and validator. -#if !defined(LIBYUV_DISABLE_X86) && \ - (defined(VISUALC_HAS_AVX2) || defined(CLANG_HAS_AVX2) || \ - defined(GCC_HAS_AVX2)) -#define HAS_SCALEADDROW_AVX2 -#define HAS_SCALEROWDOWN2_AVX2 -#define HAS_SCALEROWDOWN4_AVX2 -#endif - -// The following are available on Neon platforms: -#if !defined(LIBYUV_DISABLE_NEON) && \ - (defined(__ARM_NEON__) || defined(LIBYUV_NEON) || defined(__aarch64__)) -#define HAS_SCALEADDROW_NEON -#define HAS_SCALEARGBCOLS_NEON -#define HAS_SCALEARGBFILTERCOLS_NEON -#define HAS_SCALEARGBROWDOWN2_NEON -#define HAS_SCALEARGBROWDOWNEVEN_NEON -#define HAS_SCALEFILTERCOLS_NEON -#define HAS_SCALEROWDOWN2_NEON -#define HAS_SCALEROWDOWN34_NEON -#define HAS_SCALEROWDOWN38_NEON -#define HAS_SCALEROWDOWN4_NEON -#define HAS_SCALEUVROWDOWN2BOX_NEON -#define HAS_SCALEUVROWDOWNEVEN_NEON -#define HAS_SCALEROWUP2LINEAR_NEON -#define HAS_SCALEROWUP2BILINEAR_NEON -#define HAS_SCALEROWUP2LINEAR_12_NEON -#define HAS_SCALEROWUP2BILINEAR_12_NEON -#define HAS_SCALEROWUP2LINEAR_16_NEON -#define HAS_SCALEROWUP2BILINEAR_16_NEON -#define HAS_SCALEUVROWUP2LINEAR_NEON -#define HAS_SCALEUVROWUP2BILINEAR_NEON -#define HAS_SCALEUVROWUP2LINEAR_16_NEON -#define HAS_SCALEUVROWUP2BILINEAR_16_NEON -#endif - -#if !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa) -#define HAS_SCALEADDROW_MSA -#define HAS_SCALEARGBCOLS_MSA -#define HAS_SCALEARGBFILTERCOLS_MSA -#define HAS_SCALEARGBROWDOWN2_MSA -#define HAS_SCALEARGBROWDOWNEVEN_MSA -#define HAS_SCALEFILTERCOLS_MSA -#define HAS_SCALEROWDOWN2_MSA -#define HAS_SCALEROWDOWN34_MSA -#define HAS_SCALEROWDOWN38_MSA -#define HAS_SCALEROWDOWN4_MSA -#endif - -#if !defined(LIBYUV_DISABLE_MMI) && defined(_MIPS_ARCH_LOONGSON3A) -#define HAS_FIXEDDIV1_MIPS -#define HAS_FIXEDDIV_MIPS -#define HAS_SCALEADDROW_16_MMI -#define HAS_SCALEADDROW_MMI -#define HAS_SCALEARGBCOLS_MMI -#define HAS_SCALEARGBCOLSUP2_MMI -#define HAS_SCALEARGBROWDOWN2_MMI -#define HAS_SCALEARGBROWDOWNEVEN_MMI -#define HAS_SCALECOLS_16_MMI -#define HAS_SCALECOLS_MMI -#define HAS_SCALEROWDOWN2_16_MMI -#define HAS_SCALEROWDOWN2_MMI -#define HAS_SCALEROWDOWN4_16_MMI -#define HAS_SCALEROWDOWN4_MMI -#define HAS_SCALEROWDOWN34_MMI -#endif - -// Scale ARGB vertically with bilinear interpolation. -void ScalePlaneVertical(int src_height, - int dst_width, - int dst_height, - int src_stride, - int dst_stride, - const uint8_t* src_argb, - uint8_t* dst_argb, - int x, - int y, - int dy, - int bpp, - enum FilterMode filtering); - -void ScalePlaneVertical_16(int src_height, - int dst_width, - int dst_height, - int src_stride, - int dst_stride, - const uint16_t* src_argb, - uint16_t* dst_argb, - int x, - int y, - int dy, - int wpp, - enum FilterMode filtering); - -// Simplify the filtering based on scale factors. 
-enum FilterMode ScaleFilterReduce(int src_width, - int src_height, - int dst_width, - int dst_height, - enum FilterMode filtering); - -// Divide num by div and return as 16.16 fixed point result. -int FixedDiv_C(int num, int div); -int FixedDiv_X86(int num, int div); -int FixedDiv_MIPS(int num, int div); -// Divide num - 1 by div - 1 and return as 16.16 fixed point result. -int FixedDiv1_C(int num, int div); -int FixedDiv1_X86(int num, int div); -int FixedDiv1_MIPS(int num, int div); -#ifdef HAS_FIXEDDIV_X86 -#define FixedDiv FixedDiv_X86 -#define FixedDiv1 FixedDiv1_X86 -#elif defined HAS_FIXEDDIV_MIPS -#define FixedDiv FixedDiv_MIPS -#define FixedDiv1 FixedDiv1_MIPS -#else -#define FixedDiv FixedDiv_C -#define FixedDiv1 FixedDiv1_C -#endif - -// Compute slope values for stepping. -void ScaleSlope(int src_width, - int src_height, - int dst_width, - int dst_height, - enum FilterMode filtering, - int* x, - int* y, - int* dx, - int* dy); - -void ScaleRowDown2_C(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst, - int dst_width); -void ScaleRowDown2_16_C(const uint16_t* src_ptr, - ptrdiff_t src_stride, - uint16_t* dst, - int dst_width); -void ScaleRowDown2Linear_C(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst, - int dst_width); -void ScaleRowDown2Linear_16_C(const uint16_t* src_ptr, - ptrdiff_t src_stride, - uint16_t* dst, - int dst_width); -void ScaleRowDown2Box_C(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst, - int dst_width); -void ScaleRowDown2Box_Odd_C(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst, - int dst_width); -void ScaleRowDown2Box_16_C(const uint16_t* src_ptr, - ptrdiff_t src_stride, - uint16_t* dst, - int dst_width); -void ScaleRowDown4_C(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst, - int dst_width); -void ScaleRowDown4_16_C(const uint16_t* src_ptr, - ptrdiff_t src_stride, - uint16_t* dst, - int dst_width); -void ScaleRowDown4Box_C(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst, - int dst_width); -void ScaleRowDown4Box_16_C(const uint16_t* src_ptr, - ptrdiff_t src_stride, - uint16_t* dst, - int dst_width); -void ScaleRowDown34_C(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst, - int dst_width); -void ScaleRowDown34_16_C(const uint16_t* src_ptr, - ptrdiff_t src_stride, - uint16_t* dst, - int dst_width); -void ScaleRowDown34_0_Box_C(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* d, - int dst_width); -void ScaleRowDown34_0_Box_16_C(const uint16_t* src_ptr, - ptrdiff_t src_stride, - uint16_t* d, - int dst_width); -void ScaleRowDown34_1_Box_C(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* d, - int dst_width); -void ScaleRowDown34_1_Box_16_C(const uint16_t* src_ptr, - ptrdiff_t src_stride, - uint16_t* d, - int dst_width); - -void ScaleRowUp2_Linear_C(const uint8_t* src_ptr, - uint8_t* dst_ptr, - int dst_width); -void ScaleRowUp2_Bilinear_C(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - ptrdiff_t dst_stride, - int dst_width); -void ScaleRowUp2_Linear_16_C(const uint16_t* src_ptr, - uint16_t* dst_ptr, - int dst_width); -void ScaleRowUp2_Bilinear_16_C(const uint16_t* src_ptr, - ptrdiff_t src_stride, - uint16_t* dst_ptr, - ptrdiff_t dst_stride, - int dst_width); -void ScaleRowUp2_Linear_Any_C(const uint8_t* src_ptr, - uint8_t* dst_ptr, - int dst_width); -void ScaleRowUp2_Bilinear_Any_C(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - ptrdiff_t dst_stride, - int dst_width); -void 
ScaleRowUp2_Linear_16_Any_C(const uint16_t* src_ptr, - uint16_t* dst_ptr, - int dst_width); -void ScaleRowUp2_Bilinear_16_Any_C(const uint16_t* src_ptr, - ptrdiff_t src_stride, - uint16_t* dst_ptr, - ptrdiff_t dst_stride, - int dst_width); - -void ScaleCols_C(uint8_t* dst_ptr, - const uint8_t* src_ptr, - int dst_width, - int x, - int dx); -void ScaleCols_16_C(uint16_t* dst_ptr, - const uint16_t* src_ptr, - int dst_width, - int x, - int dx); -void ScaleColsUp2_C(uint8_t* dst_ptr, - const uint8_t* src_ptr, - int dst_width, - int, - int); -void ScaleColsUp2_16_C(uint16_t* dst_ptr, - const uint16_t* src_ptr, - int dst_width, - int, - int); -void ScaleFilterCols_C(uint8_t* dst_ptr, - const uint8_t* src_ptr, - int dst_width, - int x, - int dx); -void ScaleFilterCols_16_C(uint16_t* dst_ptr, - const uint16_t* src_ptr, - int dst_width, - int x, - int dx); -void ScaleFilterCols64_C(uint8_t* dst_ptr, - const uint8_t* src_ptr, - int dst_width, - int x32, - int dx); -void ScaleFilterCols64_16_C(uint16_t* dst_ptr, - const uint16_t* src_ptr, - int dst_width, - int x32, - int dx); -void ScaleRowDown38_C(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst, - int dst_width); -void ScaleRowDown38_16_C(const uint16_t* src_ptr, - ptrdiff_t src_stride, - uint16_t* dst, - int dst_width); -void ScaleRowDown38_3_Box_C(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width); -void ScaleRowDown38_3_Box_16_C(const uint16_t* src_ptr, - ptrdiff_t src_stride, - uint16_t* dst_ptr, - int dst_width); -void ScaleRowDown38_2_Box_C(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width); -void ScaleRowDown38_2_Box_16_C(const uint16_t* src_ptr, - ptrdiff_t src_stride, - uint16_t* dst_ptr, - int dst_width); -void ScaleAddRow_C(const uint8_t* src_ptr, uint16_t* dst_ptr, int src_width); -void ScaleAddRow_16_C(const uint16_t* src_ptr, - uint32_t* dst_ptr, - int src_width); -void ScaleARGBRowDown2_C(const uint8_t* src_argb, - ptrdiff_t src_stride, - uint8_t* dst_argb, - int dst_width); -void ScaleARGBRowDown2Linear_C(const uint8_t* src_argb, - ptrdiff_t src_stride, - uint8_t* dst_argb, - int dst_width); -void ScaleARGBRowDown2Box_C(const uint8_t* src_argb, - ptrdiff_t src_stride, - uint8_t* dst_argb, - int dst_width); -void ScaleARGBRowDownEven_C(const uint8_t* src_argb, - ptrdiff_t src_stride, - int src_stepx, - uint8_t* dst_argb, - int dst_width); -void ScaleARGBRowDownEvenBox_C(const uint8_t* src_argb, - ptrdiff_t src_stride, - int src_stepx, - uint8_t* dst_argb, - int dst_width); -void ScaleARGBCols_C(uint8_t* dst_argb, - const uint8_t* src_argb, - int dst_width, - int x, - int dx); -void ScaleARGBCols64_C(uint8_t* dst_argb, - const uint8_t* src_argb, - int dst_width, - int x32, - int dx); -void ScaleARGBColsUp2_C(uint8_t* dst_argb, - const uint8_t* src_argb, - int dst_width, - int, - int); -void ScaleARGBFilterCols_C(uint8_t* dst_argb, - const uint8_t* src_argb, - int dst_width, - int x, - int dx); -void ScaleARGBFilterCols64_C(uint8_t* dst_argb, - const uint8_t* src_argb, - int dst_width, - int x32, - int dx); -void ScaleUVRowDown2_C(const uint8_t* src_uv, - ptrdiff_t src_stride, - uint8_t* dst_uv, - int dst_width); -void ScaleUVRowDown2Linear_C(const uint8_t* src_uv, - ptrdiff_t src_stride, - uint8_t* dst_uv, - int dst_width); -void ScaleUVRowDown2Box_C(const uint8_t* src_uv, - ptrdiff_t src_stride, - uint8_t* dst_uv, - int dst_width); -void ScaleUVRowDownEven_C(const uint8_t* src_uv, - ptrdiff_t src_stride, - int src_stepx, - uint8_t* dst_uv, - int 
dst_width); -void ScaleUVRowDownEvenBox_C(const uint8_t* src_uv, - ptrdiff_t src_stride, - int src_stepx, - uint8_t* dst_uv, - int dst_width); - -void ScaleUVRowUp2_Linear_C(const uint8_t* src_ptr, - uint8_t* dst_ptr, - int dst_width); -void ScaleUVRowUp2_Bilinear_C(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - ptrdiff_t dst_stride, - int dst_width); -void ScaleUVRowUp2_Linear_Any_C(const uint8_t* src_ptr, - uint8_t* dst_ptr, - int dst_width); -void ScaleUVRowUp2_Bilinear_Any_C(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - ptrdiff_t dst_stride, - int dst_width); -void ScaleUVRowUp2_Linear_16_C(const uint16_t* src_ptr, - uint16_t* dst_ptr, - int dst_width); -void ScaleUVRowUp2_Bilinear_16_C(const uint16_t* src_ptr, - ptrdiff_t src_stride, - uint16_t* dst_ptr, - ptrdiff_t dst_stride, - int dst_width); -void ScaleUVRowUp2_Linear_16_Any_C(const uint16_t* src_ptr, - uint16_t* dst_ptr, - int dst_width); -void ScaleUVRowUp2_Bilinear_16_Any_C(const uint16_t* src_ptr, - ptrdiff_t src_stride, - uint16_t* dst_ptr, - ptrdiff_t dst_stride, - int dst_width); - -void ScaleUVCols_C(uint8_t* dst_uv, - const uint8_t* src_uv, - int dst_width, - int x, - int dx); -void ScaleUVCols64_C(uint8_t* dst_uv, - const uint8_t* src_uv, - int dst_width, - int x32, - int dx); -void ScaleUVColsUp2_C(uint8_t* dst_uv, - const uint8_t* src_uv, - int dst_width, - int, - int); -void ScaleUVFilterCols_C(uint8_t* dst_uv, - const uint8_t* src_uv, - int dst_width, - int x, - int dx); -void ScaleUVFilterCols64_C(uint8_t* dst_uv, - const uint8_t* src_uv, - int dst_width, - int x32, - int dx); - -// Specialized scalers for x86. -void ScaleRowDown2_SSSE3(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width); -void ScaleRowDown2Linear_SSSE3(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width); -void ScaleRowDown2Box_SSSE3(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width); -void ScaleRowDown2_AVX2(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width); -void ScaleRowDown2Linear_AVX2(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width); -void ScaleRowDown2Box_AVX2(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width); -void ScaleRowDown4_SSSE3(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width); -void ScaleRowDown4Box_SSSE3(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width); -void ScaleRowDown4_AVX2(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width); -void ScaleRowDown4Box_AVX2(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width); - -void ScaleRowDown34_SSSE3(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width); -void ScaleRowDown34_1_Box_SSSE3(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width); -void ScaleRowDown34_0_Box_SSSE3(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width); -void ScaleRowDown38_SSSE3(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width); -void ScaleRowDown38_3_Box_SSSE3(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width); -void ScaleRowDown38_2_Box_SSSE3(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width); - -void ScaleRowUp2_Linear_SSE2(const uint8_t* 
src_ptr, - uint8_t* dst_ptr, - int dst_width); -void ScaleRowUp2_Bilinear_SSE2(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - ptrdiff_t dst_stride, - int dst_width); -void ScaleRowUp2_Linear_12_SSSE3(const uint16_t* src_ptr, - uint16_t* dst_ptr, - int dst_width); -void ScaleRowUp2_Bilinear_12_SSSE3(const uint16_t* src_ptr, - ptrdiff_t src_stride, - uint16_t* dst_ptr, - ptrdiff_t dst_stride, - int dst_width); -void ScaleRowUp2_Linear_16_SSE2(const uint16_t* src_ptr, - uint16_t* dst_ptr, - int dst_width); -void ScaleRowUp2_Bilinear_16_SSE2(const uint16_t* src_ptr, - ptrdiff_t src_stride, - uint16_t* dst_ptr, - ptrdiff_t dst_stride, - int dst_width); -void ScaleRowUp2_Linear_SSSE3(const uint8_t* src_ptr, - uint8_t* dst_ptr, - int dst_width); -void ScaleRowUp2_Bilinear_SSSE3(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - ptrdiff_t dst_stride, - int dst_width); -void ScaleRowUp2_Linear_AVX2(const uint8_t* src_ptr, - uint8_t* dst_ptr, - int dst_width); -void ScaleRowUp2_Bilinear_AVX2(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - ptrdiff_t dst_stride, - int dst_width); -void ScaleRowUp2_Linear_12_AVX2(const uint16_t* src_ptr, - uint16_t* dst_ptr, - int dst_width); -void ScaleRowUp2_Bilinear_12_AVX2(const uint16_t* src_ptr, - ptrdiff_t src_stride, - uint16_t* dst_ptr, - ptrdiff_t dst_stride, - int dst_width); -void ScaleRowUp2_Linear_16_AVX2(const uint16_t* src_ptr, - uint16_t* dst_ptr, - int dst_width); -void ScaleRowUp2_Bilinear_16_AVX2(const uint16_t* src_ptr, - ptrdiff_t src_stride, - uint16_t* dst_ptr, - ptrdiff_t dst_stride, - int dst_width); -void ScaleRowUp2_Linear_Any_SSE2(const uint8_t* src_ptr, - uint8_t* dst_ptr, - int dst_width); -void ScaleRowUp2_Bilinear_Any_SSE2(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - ptrdiff_t dst_stride, - int dst_width); -void ScaleRowUp2_Linear_12_Any_SSSE3(const uint16_t* src_ptr, - uint16_t* dst_ptr, - int dst_width); -void ScaleRowUp2_Bilinear_12_Any_SSSE3(const uint16_t* src_ptr, - ptrdiff_t src_stride, - uint16_t* dst_ptr, - ptrdiff_t dst_stride, - int dst_width); -void ScaleRowUp2_Linear_16_Any_SSE2(const uint16_t* src_ptr, - uint16_t* dst_ptr, - int dst_width); -void ScaleRowUp2_Bilinear_16_Any_SSSE3(const uint16_t* src_ptr, - ptrdiff_t src_stride, - uint16_t* dst_ptr, - ptrdiff_t dst_stride, - int dst_width); -void ScaleRowUp2_Linear_Any_SSSE3(const uint8_t* src_ptr, - uint8_t* dst_ptr, - int dst_width); -void ScaleRowUp2_Bilinear_Any_SSSE3(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - ptrdiff_t dst_stride, - int dst_width); -void ScaleRowUp2_Linear_Any_AVX2(const uint8_t* src_ptr, - uint8_t* dst_ptr, - int dst_width); -void ScaleRowUp2_Bilinear_Any_AVX2(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - ptrdiff_t dst_stride, - int dst_width); -void ScaleRowUp2_Linear_12_Any_AVX2(const uint16_t* src_ptr, - uint16_t* dst_ptr, - int dst_width); -void ScaleRowUp2_Bilinear_12_Any_AVX2(const uint16_t* src_ptr, - ptrdiff_t src_stride, - uint16_t* dst_ptr, - ptrdiff_t dst_stride, - int dst_width); -void ScaleRowUp2_Linear_16_Any_AVX2(const uint16_t* src_ptr, - uint16_t* dst_ptr, - int dst_width); -void ScaleRowUp2_Bilinear_16_Any_AVX2(const uint16_t* src_ptr, - ptrdiff_t src_stride, - uint16_t* dst_ptr, - ptrdiff_t dst_stride, - int dst_width); - -void ScaleRowDown2_Any_SSSE3(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width); -void ScaleRowDown2Linear_Any_SSSE3(const uint8_t* src_ptr, 
- ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width); -void ScaleRowDown2Box_Any_SSSE3(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width); -void ScaleRowDown2Box_Odd_SSSE3(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width); -void ScaleRowDown2_Any_AVX2(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width); -void ScaleRowDown2Linear_Any_AVX2(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width); -void ScaleRowDown2Box_Any_AVX2(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width); -void ScaleRowDown2Box_Odd_AVX2(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width); -void ScaleRowDown4_Any_SSSE3(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width); -void ScaleRowDown4Box_Any_SSSE3(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width); -void ScaleRowDown4_Any_AVX2(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width); -void ScaleRowDown4Box_Any_AVX2(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width); - -void ScaleRowDown34_Any_SSSE3(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width); -void ScaleRowDown34_1_Box_Any_SSSE3(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width); -void ScaleRowDown34_0_Box_Any_SSSE3(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width); -void ScaleRowDown38_Any_SSSE3(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width); -void ScaleRowDown38_3_Box_Any_SSSE3(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width); -void ScaleRowDown38_2_Box_Any_SSSE3(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width); - -void ScaleAddRow_SSE2(const uint8_t* src_ptr, uint16_t* dst_ptr, int src_width); -void ScaleAddRow_AVX2(const uint8_t* src_ptr, uint16_t* dst_ptr, int src_width); -void ScaleAddRow_Any_SSE2(const uint8_t* src_ptr, - uint16_t* dst_ptr, - int src_width); -void ScaleAddRow_Any_AVX2(const uint8_t* src_ptr, - uint16_t* dst_ptr, - int src_width); - -void ScaleFilterCols_SSSE3(uint8_t* dst_ptr, - const uint8_t* src_ptr, - int dst_width, - int x, - int dx); -void ScaleColsUp2_SSE2(uint8_t* dst_ptr, - const uint8_t* src_ptr, - int dst_width, - int x, - int dx); - -// ARGB Column functions -void ScaleARGBCols_SSE2(uint8_t* dst_argb, - const uint8_t* src_argb, - int dst_width, - int x, - int dx); -void ScaleARGBFilterCols_SSSE3(uint8_t* dst_argb, - const uint8_t* src_argb, - int dst_width, - int x, - int dx); -void ScaleARGBColsUp2_SSE2(uint8_t* dst_argb, - const uint8_t* src_argb, - int dst_width, - int x, - int dx); -void ScaleARGBFilterCols_NEON(uint8_t* dst_argb, - const uint8_t* src_argb, - int dst_width, - int x, - int dx); -void ScaleARGBCols_NEON(uint8_t* dst_argb, - const uint8_t* src_argb, - int dst_width, - int x, - int dx); -void ScaleARGBFilterCols_Any_NEON(uint8_t* dst_ptr, - const uint8_t* src_ptr, - int dst_width, - int x, - int dx); -void ScaleARGBCols_Any_NEON(uint8_t* dst_ptr, - const uint8_t* src_ptr, - int dst_width, - int x, - int dx); -void ScaleARGBFilterCols_MSA(uint8_t* dst_argb, - const uint8_t* src_argb, - int dst_width, - int x, - int dx); -void ScaleARGBCols_MSA(uint8_t* dst_argb, - const uint8_t* src_argb, - int dst_width, - int x, - int 
dx); -void ScaleARGBFilterCols_Any_MSA(uint8_t* dst_ptr, - const uint8_t* src_ptr, - int dst_width, - int x, - int dx); -void ScaleARGBCols_Any_MSA(uint8_t* dst_ptr, - const uint8_t* src_ptr, - int dst_width, - int x, - int dx); -void ScaleARGBCols_MMI(uint8_t* dst_argb, - const uint8_t* src_argb, - int dst_width, - int x, - int dx); -void ScaleARGBCols_Any_MMI(uint8_t* dst_ptr, - const uint8_t* src_ptr, - int dst_width, - int x, - int dx); - -// ARGB Row functions -void ScaleARGBRowDown2_SSE2(const uint8_t* src_argb, - ptrdiff_t src_stride, - uint8_t* dst_argb, - int dst_width); -void ScaleARGBRowDown2Linear_SSE2(const uint8_t* src_argb, - ptrdiff_t src_stride, - uint8_t* dst_argb, - int dst_width); -void ScaleARGBRowDown2Box_SSE2(const uint8_t* src_argb, - ptrdiff_t src_stride, - uint8_t* dst_argb, - int dst_width); -void ScaleARGBRowDown2_NEON(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst, - int dst_width); -void ScaleARGBRowDown2Linear_NEON(const uint8_t* src_argb, - ptrdiff_t src_stride, - uint8_t* dst_argb, - int dst_width); -void ScaleARGBRowDown2Box_NEON(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst, - int dst_width); -void ScaleARGBRowDown2_MSA(const uint8_t* src_argb, - ptrdiff_t src_stride, - uint8_t* dst_argb, - int dst_width); -void ScaleARGBRowDown2Linear_MSA(const uint8_t* src_argb, - ptrdiff_t src_stride, - uint8_t* dst_argb, - int dst_width); -void ScaleARGBRowDown2Box_MSA(const uint8_t* src_argb, - ptrdiff_t src_stride, - uint8_t* dst_argb, - int dst_width); -void ScaleARGBRowDown2_MMI(const uint8_t* src_argb, - ptrdiff_t src_stride, - uint8_t* dst_argb, - int dst_width); -void ScaleARGBRowDown2Linear_MMI(const uint8_t* src_argb, - ptrdiff_t src_stride, - uint8_t* dst_argb, - int dst_width); -void ScaleARGBRowDown2Box_MMI(const uint8_t* src_argb, - ptrdiff_t src_stride, - uint8_t* dst_argb, - int dst_width); -void ScaleARGBRowDown2_Any_SSE2(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width); -void ScaleARGBRowDown2Linear_Any_SSE2(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width); -void ScaleARGBRowDown2Box_Any_SSE2(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width); -void ScaleARGBRowDown2_Any_NEON(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width); -void ScaleARGBRowDown2Linear_Any_NEON(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width); -void ScaleARGBRowDown2Box_Any_NEON(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width); -void ScaleARGBRowDown2_Any_MSA(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width); -void ScaleARGBRowDown2Linear_Any_MSA(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width); -void ScaleARGBRowDown2Box_Any_MSA(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width); -void ScaleARGBRowDown2_Any_MMI(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width); -void ScaleARGBRowDown2Linear_Any_MMI(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width); -void ScaleARGBRowDown2Box_Any_MMI(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width); -void ScaleARGBRowDownEven_SSE2(const uint8_t* src_argb, - ptrdiff_t src_stride, - int src_stepx, - uint8_t* dst_argb, - int dst_width); -void ScaleARGBRowDownEvenBox_SSE2(const uint8_t* src_argb, - 
ptrdiff_t src_stride, - int src_stepx, - uint8_t* dst_argb, - int dst_width); -void ScaleARGBRowDownEven_NEON(const uint8_t* src_argb, - ptrdiff_t src_stride, - int src_stepx, - uint8_t* dst_argb, - int dst_width); -void ScaleARGBRowDownEvenBox_NEON(const uint8_t* src_argb, - ptrdiff_t src_stride, - int src_stepx, - uint8_t* dst_argb, - int dst_width); -void ScaleARGBRowDownEven_MSA(const uint8_t* src_argb, - ptrdiff_t src_stride, - int32_t src_stepx, - uint8_t* dst_argb, - int dst_width); -void ScaleARGBRowDownEvenBox_MSA(const uint8_t* src_argb, - ptrdiff_t src_stride, - int src_stepx, - uint8_t* dst_argb, - int dst_width); -void ScaleARGBRowDownEven_MMI(const uint8_t* src_argb, - ptrdiff_t src_stride, - int32_t src_stepx, - uint8_t* dst_argb, - int dst_width); -void ScaleARGBRowDownEvenBox_MMI(const uint8_t* src_argb, - ptrdiff_t src_stride, - int src_stepx, - uint8_t* dst_argb, - int dst_width); -void ScaleARGBRowDownEven_Any_SSE2(const uint8_t* src_ptr, - ptrdiff_t src_stride, - int src_stepx, - uint8_t* dst_ptr, - int dst_width); -void ScaleARGBRowDownEvenBox_Any_SSE2(const uint8_t* src_ptr, - ptrdiff_t src_stride, - int src_stepx, - uint8_t* dst_ptr, - int dst_width); -void ScaleARGBRowDownEven_Any_NEON(const uint8_t* src_ptr, - ptrdiff_t src_stride, - int src_stepx, - uint8_t* dst_ptr, - int dst_width); -void ScaleARGBRowDownEvenBox_Any_NEON(const uint8_t* src_ptr, - ptrdiff_t src_stride, - int src_stepx, - uint8_t* dst_ptr, - int dst_width); -void ScaleARGBRowDownEven_Any_MSA(const uint8_t* src_ptr, - ptrdiff_t src_stride, - int32_t src_stepx, - uint8_t* dst_ptr, - int dst_width); -void ScaleARGBRowDownEvenBox_Any_MSA(const uint8_t* src_ptr, - ptrdiff_t src_stride, - int src_stepx, - uint8_t* dst_ptr, - int dst_width); -void ScaleARGBRowDownEven_Any_MMI(const uint8_t* src_ptr, - ptrdiff_t src_stride, - int32_t src_stepx, - uint8_t* dst_ptr, - int dst_width); -void ScaleARGBRowDownEvenBox_Any_MMI(const uint8_t* src_ptr, - ptrdiff_t src_stride, - int src_stepx, - uint8_t* dst_ptr, - int dst_width); - -// UV Row functions -void ScaleUVRowDown2_SSSE3(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_uv, - int dst_width); -void ScaleUVRowDown2Linear_SSSE3(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_uv, - int dst_width); -void ScaleUVRowDown2Box_SSSE3(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_uv, - int dst_width); -void ScaleUVRowDown2Box_AVX2(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_uv, - int dst_width); -void ScaleUVRowDown2_NEON(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst, - int dst_width); -void ScaleUVRowDown2Linear_NEON(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_uv, - int dst_width); -void ScaleUVRowDown2Box_NEON(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst, - int dst_width); -void ScaleUVRowDown2_MSA(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_uv, - int dst_width); -void ScaleUVRowDown2Linear_MSA(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_uv, - int dst_width); -void ScaleUVRowDown2Box_MSA(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_uv, - int dst_width); -void ScaleUVRowDown2_MMI(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_uv, - int dst_width); -void ScaleUVRowDown2Linear_MMI(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_uv, - int dst_width); -void ScaleUVRowDown2Box_MMI(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_uv, - int dst_width); 
-void ScaleUVRowDown2_Any_SSSE3(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width); -void ScaleUVRowDown2Linear_Any_SSSE3(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width); -void ScaleUVRowDown2Box_Any_SSSE3(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width); -void ScaleUVRowDown2Box_Any_AVX2(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width); -void ScaleUVRowDown2_Any_NEON(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width); -void ScaleUVRowDown2Linear_Any_NEON(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width); -void ScaleUVRowDown2Box_Any_NEON(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width); -void ScaleUVRowDown2_Any_MSA(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width); -void ScaleUVRowDown2Linear_Any_MSA(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width); -void ScaleUVRowDown2Box_Any_MSA(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width); -void ScaleUVRowDown2_Any_MMI(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width); -void ScaleUVRowDown2Linear_Any_MMI(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width); -void ScaleUVRowDown2Box_Any_MMI(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width); -void ScaleUVRowDownEven_SSSE3(const uint8_t* src_ptr, - ptrdiff_t src_stride, - int src_stepx, - uint8_t* dst_uv, - int dst_width); -void ScaleUVRowDownEvenBox_SSSE3(const uint8_t* src_ptr, - ptrdiff_t src_stride, - int src_stepx, - uint8_t* dst_uv, - int dst_width); -void ScaleUVRowDownEven_NEON(const uint8_t* src_ptr, - ptrdiff_t src_stride, - int src_stepx, - uint8_t* dst_uv, - int dst_width); -void ScaleUVRowDownEvenBox_NEON(const uint8_t* src_ptr, - ptrdiff_t src_stride, - int src_stepx, - uint8_t* dst_uv, - int dst_width); -void ScaleUVRowDownEven_MSA(const uint8_t* src_ptr, - ptrdiff_t src_stride, - int32_t src_stepx, - uint8_t* dst_uv, - int dst_width); -void ScaleUVRowDownEvenBox_MSA(const uint8_t* src_ptr, - ptrdiff_t src_stride, - int src_stepx, - uint8_t* dst_uv, - int dst_width); -void ScaleUVRowDownEven_MMI(const uint8_t* src_ptr, - ptrdiff_t src_stride, - int32_t src_stepx, - uint8_t* dst_uv, - int dst_width); -void ScaleUVRowDownEvenBox_MMI(const uint8_t* src_ptr, - ptrdiff_t src_stride, - int src_stepx, - uint8_t* dst_uv, - int dst_width); -void ScaleUVRowDownEven_Any_SSSE3(const uint8_t* src_ptr, - ptrdiff_t src_stride, - int src_stepx, - uint8_t* dst_ptr, - int dst_width); -void ScaleUVRowDownEvenBox_Any_SSSE3(const uint8_t* src_ptr, - ptrdiff_t src_stride, - int src_stepx, - uint8_t* dst_ptr, - int dst_width); -void ScaleUVRowDownEven_Any_NEON(const uint8_t* src_ptr, - ptrdiff_t src_stride, - int src_stepx, - uint8_t* dst_ptr, - int dst_width); -void ScaleUVRowDownEvenBox_Any_NEON(const uint8_t* src_ptr, - ptrdiff_t src_stride, - int src_stepx, - uint8_t* dst_ptr, - int dst_width); -void ScaleUVRowDownEven_Any_MSA(const uint8_t* src_ptr, - ptrdiff_t src_stride, - int32_t src_stepx, - uint8_t* dst_ptr, - int dst_width); -void ScaleUVRowDownEvenBox_Any_MSA(const uint8_t* src_ptr, - ptrdiff_t src_stride, - int src_stepx, - uint8_t* dst_ptr, - int dst_width); -void ScaleUVRowDownEven_Any_MMI(const uint8_t* src_ptr, - ptrdiff_t src_stride, - 
int32_t src_stepx, - uint8_t* dst_ptr, - int dst_width); -void ScaleUVRowDownEvenBox_Any_MMI(const uint8_t* src_ptr, - ptrdiff_t src_stride, - int src_stepx, - uint8_t* dst_ptr, - int dst_width); - -void ScaleUVRowUp2_Linear_SSSE3(const uint8_t* src_ptr, - uint8_t* dst_ptr, - int dst_width); -void ScaleUVRowUp2_Bilinear_SSSE3(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - ptrdiff_t dst_stride, - int dst_width); -void ScaleUVRowUp2_Linear_Any_SSSE3(const uint8_t* src_ptr, - uint8_t* dst_ptr, - int dst_width); -void ScaleUVRowUp2_Bilinear_Any_SSSE3(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - ptrdiff_t dst_stride, - int dst_width); -void ScaleUVRowUp2_Linear_AVX2(const uint8_t* src_ptr, - uint8_t* dst_ptr, - int dst_width); -void ScaleUVRowUp2_Bilinear_AVX2(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - ptrdiff_t dst_stride, - int dst_width); -void ScaleUVRowUp2_Linear_Any_AVX2(const uint8_t* src_ptr, - uint8_t* dst_ptr, - int dst_width); -void ScaleUVRowUp2_Bilinear_Any_AVX2(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - ptrdiff_t dst_stride, - int dst_width); -void ScaleUVRowUp2_Linear_NEON(const uint8_t* src_ptr, - uint8_t* dst_ptr, - int dst_width); -void ScaleUVRowUp2_Bilinear_NEON(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - ptrdiff_t dst_stride, - int dst_width); -void ScaleUVRowUp2_Linear_Any_NEON(const uint8_t* src_ptr, - uint8_t* dst_ptr, - int dst_width); -void ScaleUVRowUp2_Bilinear_Any_NEON(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - ptrdiff_t dst_stride, - int dst_width); -void ScaleUVRowUp2_Linear_16_SSE2(const uint16_t* src_ptr, - uint16_t* dst_ptr, - int dst_width); -void ScaleUVRowUp2_Bilinear_16_SSE2(const uint16_t* src_ptr, - ptrdiff_t src_stride, - uint16_t* dst_ptr, - ptrdiff_t dst_stride, - int dst_width); -void ScaleUVRowUp2_Linear_16_Any_SSE2(const uint16_t* src_ptr, - uint16_t* dst_ptr, - int dst_width); -void ScaleUVRowUp2_Bilinear_16_Any_SSE2(const uint16_t* src_ptr, - ptrdiff_t src_stride, - uint16_t* dst_ptr, - ptrdiff_t dst_stride, - int dst_width); -void ScaleUVRowUp2_Linear_16_AVX2(const uint16_t* src_ptr, - uint16_t* dst_ptr, - int dst_width); -void ScaleUVRowUp2_Bilinear_16_AVX2(const uint16_t* src_ptr, - ptrdiff_t src_stride, - uint16_t* dst_ptr, - ptrdiff_t dst_stride, - int dst_width); -void ScaleUVRowUp2_Linear_16_Any_AVX2(const uint16_t* src_ptr, - uint16_t* dst_ptr, - int dst_width); -void ScaleUVRowUp2_Bilinear_16_Any_AVX2(const uint16_t* src_ptr, - ptrdiff_t src_stride, - uint16_t* dst_ptr, - ptrdiff_t dst_stride, - int dst_width); -void ScaleUVRowUp2_Linear_16_NEON(const uint16_t* src_ptr, - uint16_t* dst_ptr, - int dst_width); -void ScaleUVRowUp2_Bilinear_16_NEON(const uint16_t* src_ptr, - ptrdiff_t src_stride, - uint16_t* dst_ptr, - ptrdiff_t dst_stride, - int dst_width); -void ScaleUVRowUp2_Linear_16_Any_NEON(const uint16_t* src_ptr, - uint16_t* dst_ptr, - int dst_width); -void ScaleUVRowUp2_Bilinear_16_Any_NEON(const uint16_t* src_ptr, - ptrdiff_t src_stride, - uint16_t* dst_ptr, - ptrdiff_t dst_stride, - int dst_width); - -// ScaleRowDown2Box also used by planar functions -// NEON downscalers with interpolation. - -// Note - not static due to reuse in convert for 444 to 420. 
-void ScaleRowDown2_NEON(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst, - int dst_width); -void ScaleRowDown2Linear_NEON(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst, - int dst_width); -void ScaleRowDown2Box_NEON(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst, - int dst_width); - -void ScaleRowDown4_NEON(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width); -void ScaleRowDown4Box_NEON(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width); - -// Down scale from 4 to 3 pixels. Use the neon multilane read/write -// to load up the every 4th pixel into a 4 different registers. -// Point samples 32 pixels to 24 pixels. -void ScaleRowDown34_NEON(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width); -void ScaleRowDown34_0_Box_NEON(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width); -void ScaleRowDown34_1_Box_NEON(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width); - -// 32 -> 12 -void ScaleRowDown38_NEON(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width); -// 32x3 -> 12x1 -void ScaleRowDown38_3_Box_NEON(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width); -// 32x2 -> 12x1 -void ScaleRowDown38_2_Box_NEON(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width); - -void ScaleRowDown2_Any_NEON(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width); -void ScaleRowDown2Linear_Any_NEON(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width); -void ScaleRowDown2Box_Any_NEON(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width); -void ScaleRowDown2Box_Odd_NEON(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width); -void ScaleRowDown4_Any_NEON(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width); -void ScaleRowDown4Box_Any_NEON(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width); -void ScaleRowDown34_Any_NEON(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width); -void ScaleRowDown34_0_Box_Any_NEON(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width); -void ScaleRowDown34_1_Box_Any_NEON(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width); -// 32 -> 12 -void ScaleRowDown38_Any_NEON(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width); -// 32x3 -> 12x1 -void ScaleRowDown38_3_Box_Any_NEON(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width); -// 32x2 -> 12x1 -void ScaleRowDown38_2_Box_Any_NEON(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width); - -void ScaleRowUp2_Linear_NEON(const uint8_t* src_ptr, - uint8_t* dst_ptr, - int dst_width); -void ScaleRowUp2_Bilinear_NEON(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - ptrdiff_t dst_stride, - int dst_width); -void ScaleRowUp2_Linear_12_NEON(const uint16_t* src_ptr, - uint16_t* dst_ptr, - int dst_width); -void ScaleRowUp2_Bilinear_12_NEON(const uint16_t* src_ptr, - ptrdiff_t src_stride, - uint16_t* dst_ptr, - ptrdiff_t dst_stride, - int dst_width); -void ScaleRowUp2_Linear_16_NEON(const uint16_t* src_ptr, - uint16_t* dst_ptr, - int dst_width); -void 
ScaleRowUp2_Bilinear_16_NEON(const uint16_t* src_ptr, - ptrdiff_t src_stride, - uint16_t* dst_ptr, - ptrdiff_t dst_stride, - int dst_width); -void ScaleRowUp2_Linear_Any_NEON(const uint8_t* src_ptr, - uint8_t* dst_ptr, - int dst_width); -void ScaleRowUp2_Bilinear_Any_NEON(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - ptrdiff_t dst_stride, - int dst_width); -void ScaleRowUp2_Linear_12_Any_NEON(const uint16_t* src_ptr, - uint16_t* dst_ptr, - int dst_width); -void ScaleRowUp2_Bilinear_12_Any_NEON(const uint16_t* src_ptr, - ptrdiff_t src_stride, - uint16_t* dst_ptr, - ptrdiff_t dst_stride, - int dst_width); -void ScaleRowUp2_Linear_16_Any_NEON(const uint16_t* src_ptr, - uint16_t* dst_ptr, - int dst_width); -void ScaleRowUp2_Bilinear_16_Any_NEON(const uint16_t* src_ptr, - ptrdiff_t src_stride, - uint16_t* dst_ptr, - ptrdiff_t dst_stride, - int dst_width); - -void ScaleAddRow_NEON(const uint8_t* src_ptr, uint16_t* dst_ptr, int src_width); -void ScaleAddRow_Any_NEON(const uint8_t* src_ptr, - uint16_t* dst_ptr, - int src_width); - -void ScaleFilterCols_NEON(uint8_t* dst_ptr, - const uint8_t* src_ptr, - int dst_width, - int x, - int dx); - -void ScaleFilterCols_Any_NEON(uint8_t* dst_ptr, - const uint8_t* src_ptr, - int dst_width, - int x, - int dx); - -void ScaleRowDown2_MSA(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst, - int dst_width); -void ScaleRowDown2Linear_MSA(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst, - int dst_width); -void ScaleRowDown2Box_MSA(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst, - int dst_width); -void ScaleRowDown4_MSA(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst, - int dst_width); -void ScaleRowDown4Box_MSA(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst, - int dst_width); -void ScaleRowDown38_MSA(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst, - int dst_width); -void ScaleRowDown38_2_Box_MSA(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width); -void ScaleRowDown38_3_Box_MSA(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width); -void ScaleAddRow_MSA(const uint8_t* src_ptr, uint16_t* dst_ptr, int src_width); -void ScaleFilterCols_MSA(uint8_t* dst_ptr, - const uint8_t* src_ptr, - int dst_width, - int x, - int dx); -void ScaleRowDown34_MSA(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst, - int dst_width); -void ScaleRowDown34_MMI(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst, - int dst_width); -void ScaleRowDown34_0_Box_MSA(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* d, - int dst_width); -void ScaleRowDown34_1_Box_MSA(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* d, - int dst_width); - -void ScaleRowDown2_Any_MSA(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width); -void ScaleRowDown2Linear_Any_MSA(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width); -void ScaleRowDown2Box_Any_MSA(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width); -void ScaleRowDown4_Any_MSA(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width); -void ScaleRowDown4Box_Any_MSA(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width); -void ScaleRowDown38_Any_MSA(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width); -void ScaleRowDown38_2_Box_Any_MSA(const uint8_t* src_ptr, - 
ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width); -void ScaleRowDown38_3_Box_Any_MSA(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width); -void ScaleAddRow_Any_MSA(const uint8_t* src_ptr, - uint16_t* dst_ptr, - int src_width); -void ScaleFilterCols_Any_MSA(uint8_t* dst_ptr, - const uint8_t* src_ptr, - int dst_width, - int x, - int dx); -void ScaleRowDown34_Any_MSA(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width); -void ScaleRowDown34_Any_MMI(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width); -void ScaleRowDown34_0_Box_Any_MSA(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width); -void ScaleRowDown34_1_Box_Any_MSA(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width); - -void ScaleRowDown2_MMI(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst, - int dst_width); -void ScaleRowDown2_16_MMI(const uint16_t* src_ptr, - ptrdiff_t src_stride, - uint16_t* dst, - int dst_width); -void ScaleRowDown2Linear_MMI(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst, - int dst_width); -void ScaleRowDown2Linear_16_MMI(const uint16_t* src_ptr, - ptrdiff_t src_stride, - uint16_t* dst, - int dst_width); -void ScaleRowDown2Box_MMI(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst, - int dst_width); -void ScaleRowDown2Box_16_MMI(const uint16_t* src_ptr, - ptrdiff_t src_stride, - uint16_t* dst, - int dst_width); -void ScaleRowDown2Box_Odd_MMI(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst, - int dst_width); -void ScaleRowDown4_MMI(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst, - int dst_width); -void ScaleRowDown4_16_MMI(const uint16_t* src_ptr, - ptrdiff_t src_stride, - uint16_t* dst, - int dst_width); -void ScaleRowDown4Box_MMI(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst, - int dst_width); -void ScaleRowDown4Box_16_MMI(const uint16_t* src_ptr, - ptrdiff_t src_stride, - uint16_t* dst, - int dst_width); -void ScaleAddRow_MMI(const uint8_t* src_ptr, uint16_t* dst_ptr, int src_width); -void ScaleAddRow_16_MMI(const uint16_t* src_ptr, - uint32_t* dst_ptr, - int src_width); -void ScaleColsUp2_MMI(uint8_t* dst_ptr, - const uint8_t* src_ptr, - int dst_width, - int x, - int dx); -void ScaleColsUp2_16_MMI(uint16_t* dst_ptr, - const uint16_t* src_ptr, - int dst_width, - int x, - int dx); -void ScaleARGBColsUp2_MMI(uint8_t* dst_argb, - const uint8_t* src_argb, - int dst_width, - int x, - int dx); - -void ScaleRowDown2_Any_MMI(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width); -void ScaleRowDown2Linear_Any_MMI(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width); -void ScaleRowDown2Box_Any_MMI(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width); -void ScaleRowDown4_Any_MMI(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width); -void ScaleRowDown4Box_Any_MMI(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width); -void ScaleAddRow_Any_MMI(const uint8_t* src_ptr, - uint16_t* dst_ptr, - int src_width); -#ifdef __cplusplus -} // extern "C" -} // namespace libyuv -#endif - -#endif // INCLUDE_LIBYUV_SCALE_ROW_H_ diff --git a/thirdparty/libyuv/include/libyuv/scale_uv.h b/thirdparty/libyuv/include/libyuv/scale_uv.h deleted file mode 100644 index 8e74e31..0000000 --- a/thirdparty/libyuv/include/libyuv/scale_uv.h 
+++ /dev/null @@ -1,51 +0,0 @@ -/* - * Copyright 2020 The LibYuv Project Authors. All rights reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#ifndef INCLUDE_LIBYUV_SCALE_UV_H_ -#define INCLUDE_LIBYUV_SCALE_UV_H_ - -#include "libyuv/basic_types.h" -#include "libyuv/scale.h" // For FilterMode - -#ifdef __cplusplus -namespace libyuv { -extern "C" { -#endif - -LIBYUV_API -int UVScale(const uint8_t* src_uv, - int src_stride_uv, - int src_width, - int src_height, - uint8_t* dst_uv, - int dst_stride_uv, - int dst_width, - int dst_height, - enum FilterMode filtering); - -// Scale a 16 bit UV image. -// This function is currently incomplete, it can't handle all cases. -LIBYUV_API -int UVScale_16(const uint16_t* src_uv, - int src_stride_uv, - int src_width, - int src_height, - uint16_t* dst_uv, - int dst_stride_uv, - int dst_width, - int dst_height, - enum FilterMode filtering); - -#ifdef __cplusplus -} // extern "C" -} // namespace libyuv -#endif - -#endif // INCLUDE_LIBYUV_SCALE_UV_H_ diff --git a/thirdparty/libyuv/include/libyuv/version.h b/thirdparty/libyuv/include/libyuv/version.h deleted file mode 100644 index d720d48..0000000 --- a/thirdparty/libyuv/include/libyuv/version.h +++ /dev/null @@ -1,16 +0,0 @@ -/* - * Copyright 2012 The LibYuv Project Authors. All rights reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#ifndef INCLUDE_LIBYUV_VERSION_H_ -#define INCLUDE_LIBYUV_VERSION_H_ - -#define LIBYUV_VERSION 1787 - -#endif // INCLUDE_LIBYUV_VERSION_H_ diff --git a/thirdparty/libyuv/include/libyuv/video_common.h b/thirdparty/libyuv/include/libyuv/video_common.h deleted file mode 100644 index 32b8a52..0000000 --- a/thirdparty/libyuv/include/libyuv/video_common.h +++ /dev/null @@ -1,222 +0,0 @@ -/* - * Copyright 2011 The LibYuv Project Authors. All rights reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -// Common definitions for video, including fourcc and VideoFormat. - -#ifndef INCLUDE_LIBYUV_VIDEO_COMMON_H_ -#define INCLUDE_LIBYUV_VIDEO_COMMON_H_ - -#include "libyuv/basic_types.h" - -#ifdef __cplusplus -namespace libyuv { -extern "C" { -#endif - -////////////////////////////////////////////////////////////////////////////// -// Definition of FourCC codes -////////////////////////////////////////////////////////////////////////////// - -// Convert four characters to a FourCC code. -// Needs to be a macro otherwise the OS X compiler complains when the kFormat* -// constants are used in a switch. 
-#ifdef __cplusplus -#define FOURCC(a, b, c, d) \ - ((static_cast<uint32_t>(a)) | (static_cast<uint32_t>(b) << 8) | \ - (static_cast<uint32_t>(c) << 16) | /* NOLINT */ \ - (static_cast<uint32_t>(d) << 24)) /* NOLINT */ -#else -#define FOURCC(a, b, c, d) \ - (((uint32_t)(a)) | ((uint32_t)(b) << 8) | /* NOLINT */ \ - ((uint32_t)(c) << 16) | ((uint32_t)(d) << 24)) /* NOLINT */ -#endif - -// Some pages discussing FourCC codes: -// http://www.fourcc.org/yuv.php -// http://v4l2spec.bytesex.org/spec/book1.htm -// http://developer.apple.com/quicktime/icefloe/dispatch020.html -// http://msdn.microsoft.com/library/windows/desktop/dd206750.aspx#nv12 -// http://people.xiph.org/~xiphmont/containers/nut/nut4cc.txt - -// FourCC codes grouped according to implementation efficiency. -// Primary formats should convert in 1 efficient step. -// Secondary formats are converted in 2 steps. -// Auxilliary formats call primary converters. -enum FourCC { - // 10 Primary YUV formats: 5 planar, 2 biplanar, 2 packed. - FOURCC_I420 = FOURCC('I', '4', '2', '0'), - FOURCC_I422 = FOURCC('I', '4', '2', '2'), - FOURCC_I444 = FOURCC('I', '4', '4', '4'), - FOURCC_I400 = FOURCC('I', '4', '0', '0'), - FOURCC_NV21 = FOURCC('N', 'V', '2', '1'), - FOURCC_NV12 = FOURCC('N', 'V', '1', '2'), - FOURCC_YUY2 = FOURCC('Y', 'U', 'Y', '2'), - FOURCC_UYVY = FOURCC('U', 'Y', 'V', 'Y'), - FOURCC_I010 = FOURCC('I', '0', '1', '0'), // bt.601 10 bit 420 - FOURCC_I210 = FOURCC('I', '2', '1', '0'), // bt.601 10 bit 422 - - // 1 Secondary YUV format: row biplanar. deprecated. - FOURCC_M420 = FOURCC('M', '4', '2', '0'), - - // 13 Primary RGB formats: 4 32 bpp, 2 24 bpp, 3 16 bpp, 1 10 bpc 2 64 bpp - FOURCC_ARGB = FOURCC('A', 'R', 'G', 'B'), - FOURCC_BGRA = FOURCC('B', 'G', 'R', 'A'), - FOURCC_ABGR = FOURCC('A', 'B', 'G', 'R'), - FOURCC_AR30 = FOURCC('A', 'R', '3', '0'), // 10 bit per channel. 2101010. - FOURCC_AB30 = FOURCC('A', 'B', '3', '0'), // ABGR version of 10 bit - FOURCC_AR64 = FOURCC('A', 'R', '6', '4'), // 16 bit per channel. - FOURCC_AB64 = FOURCC('A', 'B', '6', '4'), // ABGR version of 16 bit - FOURCC_24BG = FOURCC('2', '4', 'B', 'G'), - FOURCC_RAW = FOURCC('r', 'a', 'w', ' '), - FOURCC_RGBA = FOURCC('R', 'G', 'B', 'A'), - FOURCC_RGBP = FOURCC('R', 'G', 'B', 'P'), // rgb565 LE. - FOURCC_RGBO = FOURCC('R', 'G', 'B', 'O'), // argb1555 LE. - FOURCC_R444 = FOURCC('R', '4', '4', '4'), // argb4444 LE. - - // 1 Primary Compressed YUV format. - FOURCC_MJPG = FOURCC('M', 'J', 'P', 'G'), - - // 14 Auxiliary YUV variations: 3 with U and V planes are swapped, 1 Alias. - FOURCC_YV12 = FOURCC('Y', 'V', '1', '2'), - FOURCC_YV16 = FOURCC('Y', 'V', '1', '6'), - FOURCC_YV24 = FOURCC('Y', 'V', '2', '4'), - FOURCC_YU12 = FOURCC('Y', 'U', '1', '2'), // Linux version of I420.
- FOURCC_J420 = - FOURCC('J', '4', '2', '0'), // jpeg (bt.601 full), unofficial fourcc - FOURCC_J422 = - FOURCC('J', '4', '2', '2'), // jpeg (bt.601 full), unofficial fourcc - FOURCC_J444 = - FOURCC('J', '4', '4', '4'), // jpeg (bt.601 full), unofficial fourcc - FOURCC_J400 = - FOURCC('J', '4', '0', '0'), // jpeg (bt.601 full), unofficial fourcc - FOURCC_F420 = FOURCC('F', '4', '2', '0'), // bt.709 full, unofficial fourcc - FOURCC_F422 = FOURCC('F', '4', '2', '2'), // bt.709 full, unofficial fourcc - FOURCC_F444 = FOURCC('F', '4', '4', '4'), // bt.709 full, unofficial fourcc - FOURCC_H420 = FOURCC('H', '4', '2', '0'), // bt.709, unofficial fourcc - FOURCC_H422 = FOURCC('H', '4', '2', '2'), // bt.709, unofficial fourcc - FOURCC_H444 = FOURCC('H', '4', '4', '4'), // bt.709, unofficial fourcc - FOURCC_U420 = FOURCC('U', '4', '2', '0'), // bt.2020, unofficial fourcc - FOURCC_U422 = FOURCC('U', '4', '2', '2'), // bt.2020, unofficial fourcc - FOURCC_U444 = FOURCC('U', '4', '4', '4'), // bt.2020, unofficial fourcc - FOURCC_F010 = FOURCC('F', '0', '1', '0'), // bt.709 full range 10 bit 420 - FOURCC_H010 = FOURCC('H', '0', '1', '0'), // bt.709 10 bit 420 - FOURCC_U010 = FOURCC('U', '0', '1', '0'), // bt.2020 10 bit 420 - FOURCC_F210 = FOURCC('F', '2', '1', '0'), // bt.709 full range 10 bit 422 - FOURCC_H210 = FOURCC('H', '2', '1', '0'), // bt.709 10 bit 422 - FOURCC_U210 = FOURCC('U', '2', '1', '0'), // bt.2020 10 bit 422 - FOURCC_P010 = FOURCC('P', '0', '1', '0'), - FOURCC_P210 = FOURCC('P', '2', '1', '0'), - - // 14 Auxiliary aliases. CanonicalFourCC() maps these to canonical fourcc. - FOURCC_IYUV = FOURCC('I', 'Y', 'U', 'V'), // Alias for I420. - FOURCC_YU16 = FOURCC('Y', 'U', '1', '6'), // Alias for I422. - FOURCC_YU24 = FOURCC('Y', 'U', '2', '4'), // Alias for I444. - FOURCC_YUYV = FOURCC('Y', 'U', 'Y', 'V'), // Alias for YUY2. - FOURCC_YUVS = FOURCC('y', 'u', 'v', 's'), // Alias for YUY2 on Mac. - FOURCC_HDYC = FOURCC('H', 'D', 'Y', 'C'), // Alias for UYVY. - FOURCC_2VUY = FOURCC('2', 'v', 'u', 'y'), // Alias for UYVY on Mac. - FOURCC_JPEG = FOURCC('J', 'P', 'E', 'G'), // Alias for MJPG. - FOURCC_DMB1 = FOURCC('d', 'm', 'b', '1'), // Alias for MJPG on Mac. - FOURCC_BA81 = FOURCC('B', 'A', '8', '1'), // Alias for BGGR. - FOURCC_RGB3 = FOURCC('R', 'G', 'B', '3'), // Alias for RAW. - FOURCC_BGR3 = FOURCC('B', 'G', 'R', '3'), // Alias for 24BG. - FOURCC_CM32 = FOURCC(0, 0, 0, 32), // Alias for BGRA kCMPixelFormat_32ARGB - FOURCC_CM24 = FOURCC(0, 0, 0, 24), // Alias for RAW kCMPixelFormat_24RGB - FOURCC_L555 = FOURCC('L', '5', '5', '5'), // Alias for RGBO. - FOURCC_L565 = FOURCC('L', '5', '6', '5'), // Alias for RGBP. - FOURCC_5551 = FOURCC('5', '5', '5', '1'), // Alias for RGBO. - - // deprecated formats. Not supported, but defined for backward compatibility. - FOURCC_I411 = FOURCC('I', '4', '1', '1'), - FOURCC_Q420 = FOURCC('Q', '4', '2', '0'), - FOURCC_RGGB = FOURCC('R', 'G', 'G', 'B'), - FOURCC_BGGR = FOURCC('B', 'G', 'G', 'R'), - FOURCC_GRBG = FOURCC('G', 'R', 'B', 'G'), - FOURCC_GBRG = FOURCC('G', 'B', 'R', 'G'), - FOURCC_H264 = FOURCC('H', '2', '6', '4'), - - // Match any fourcc. - FOURCC_ANY = -1, -}; - -enum FourCCBpp { - // Canonical fourcc codes used in our code. 
- FOURCC_BPP_I420 = 12, - FOURCC_BPP_I422 = 16, - FOURCC_BPP_I444 = 24, - FOURCC_BPP_I411 = 12, - FOURCC_BPP_I400 = 8, - FOURCC_BPP_NV21 = 12, - FOURCC_BPP_NV12 = 12, - FOURCC_BPP_YUY2 = 16, - FOURCC_BPP_UYVY = 16, - FOURCC_BPP_M420 = 12, // deprecated - FOURCC_BPP_Q420 = 12, - FOURCC_BPP_ARGB = 32, - FOURCC_BPP_BGRA = 32, - FOURCC_BPP_ABGR = 32, - FOURCC_BPP_RGBA = 32, - FOURCC_BPP_AR30 = 32, - FOURCC_BPP_AB30 = 32, - FOURCC_BPP_AR64 = 64, - FOURCC_BPP_AB64 = 64, - FOURCC_BPP_24BG = 24, - FOURCC_BPP_RAW = 24, - FOURCC_BPP_RGBP = 16, - FOURCC_BPP_RGBO = 16, - FOURCC_BPP_R444 = 16, - FOURCC_BPP_RGGB = 8, - FOURCC_BPP_BGGR = 8, - FOURCC_BPP_GRBG = 8, - FOURCC_BPP_GBRG = 8, - FOURCC_BPP_YV12 = 12, - FOURCC_BPP_YV16 = 16, - FOURCC_BPP_YV24 = 24, - FOURCC_BPP_YU12 = 12, - FOURCC_BPP_J420 = 12, - FOURCC_BPP_J400 = 8, - FOURCC_BPP_H420 = 12, - FOURCC_BPP_H422 = 16, - FOURCC_BPP_I010 = 15, - FOURCC_BPP_I210 = 20, - FOURCC_BPP_H010 = 15, - FOURCC_BPP_H210 = 20, - FOURCC_BPP_P010 = 15, - FOURCC_BPP_P210 = 20, - FOURCC_BPP_MJPG = 0, // 0 means unknown. - FOURCC_BPP_H264 = 0, - FOURCC_BPP_IYUV = 12, - FOURCC_BPP_YU16 = 16, - FOURCC_BPP_YU24 = 24, - FOURCC_BPP_YUYV = 16, - FOURCC_BPP_YUVS = 16, - FOURCC_BPP_HDYC = 16, - FOURCC_BPP_2VUY = 16, - FOURCC_BPP_JPEG = 1, - FOURCC_BPP_DMB1 = 1, - FOURCC_BPP_BA81 = 8, - FOURCC_BPP_RGB3 = 24, - FOURCC_BPP_BGR3 = 24, - FOURCC_BPP_CM32 = 32, - FOURCC_BPP_CM24 = 24, - - // Match any fourcc. - FOURCC_BPP_ANY = 0, // 0 means unknown. -}; - -// Converts fourcc aliases into canonical ones. -LIBYUV_API uint32_t CanonicalFourCC(uint32_t fourcc); - -#ifdef __cplusplus -} // extern "C" -} // namespace libyuv -#endif - -#endif // INCLUDE_LIBYUV_VIDEO_COMMON_H_ diff --git a/thirdparty/libyuv/libyuv.gni b/thirdparty/libyuv/libyuv.gni deleted file mode 100644 index 8df40ba..0000000 --- a/thirdparty/libyuv/libyuv.gni +++ /dev/null @@ -1,23 +0,0 @@ -# Copyright 2016 The LibYuv Project Authors. All rights reserved. -# -# Use of this source code is governed by a BSD-style license -# that can be found in the LICENSE file in the root of the source -# tree. An additional intellectual property rights grant can be found -# in the file PATENTS. All contributing project authors may -# be found in the AUTHORS file in the root of the source tree. - -import("//build_overrides/build.gni") -import("//build/config/arm.gni") -import("//build/config/mips.gni") - -declare_args() { - libyuv_include_tests = !build_with_chromium - libyuv_disable_jpeg = false - libyuv_use_neon = - current_cpu == "arm64" || - (current_cpu == "arm" && (arm_use_neon || arm_optionally_use_neon)) - libyuv_use_msa = - (current_cpu == "mips64el" || current_cpu == "mipsel") && mips_use_msa - libyuv_use_mmi = - (current_cpu == "mips64el" || current_cpu == "mipsel") && mips_use_mmi -} diff --git a/thirdparty/libyuv/linux.mk b/thirdparty/libyuv/linux.mk deleted file mode 100644 index f5e73ea..0000000 --- a/thirdparty/libyuv/linux.mk +++ /dev/null @@ -1,97 +0,0 @@ -# This is a generic makefile for libyuv for gcc. 
-# make -f linux.mk CXX=clang++ - -CC?=gcc -CFLAGS?=-O2 -fomit-frame-pointer -CFLAGS+=-Iinclude/ - -CXX?=g++ -CXXFLAGS?=-O2 -fomit-frame-pointer -CXXFLAGS+=-Iinclude/ - -LOCAL_OBJ_FILES := \ - source/compare.o \ - source/compare_common.o \ - source/compare_gcc.o \ - source/compare_mmi.o \ - source/compare_msa.o \ - source/compare_neon.o \ - source/compare_neon64.o \ - source/compare_win.o \ - source/convert.o \ - source/convert_argb.o \ - source/convert_from.o \ - source/convert_from_argb.o \ - source/convert_jpeg.o \ - source/convert_to_argb.o \ - source/convert_to_i420.o \ - source/cpu_id.o \ - source/mjpeg_decoder.o \ - source/mjpeg_validate.o \ - source/planar_functions.o \ - source/rotate.o \ - source/rotate_any.o \ - source/rotate_argb.o \ - source/rotate_common.o \ - source/rotate_gcc.o \ - source/rotate_mmi.o \ - source/rotate_msa.o \ - source/rotate_neon.o \ - source/rotate_neon64.o \ - source/rotate_win.o \ - source/row_any.o \ - source/row_common.o \ - source/row_gcc.o \ - source/row_mmi.o \ - source/row_msa.o \ - source/row_neon.o \ - source/row_neon64.o \ - source/row_win.o \ - source/scale.o \ - source/scale_any.o \ - source/scale_argb.o \ - source/scale_common.o \ - source/scale_gcc.o \ - source/scale_mmi.o \ - source/scale_msa.o \ - source/scale_neon.o \ - source/scale_neon64.o \ - source/scale_uv.o \ - source/scale_win.o \ - source/video_common.o - -.cc.o: - $(CXX) -c $(CXXFLAGS) $*.cc -o $*.o - -.c.o: - $(CC) -c $(CFLAGS) $*.c -o $*.o - -all: libyuv.a i444tonv12_eg yuvconvert yuvconstants cpuid psnr - -libyuv.a: $(LOCAL_OBJ_FILES) - $(AR) $(ARFLAGS) $@ $(LOCAL_OBJ_FILES) - -# A C++ test utility that uses libyuv conversion. -yuvconvert: util/yuvconvert.cc libyuv.a - $(CXX) $(CXXFLAGS) -Iutil/ -o $@ util/yuvconvert.cc libyuv.a - -# A C test utility that generates yuvconstants for yuv to rgb. -yuvconstants: util/yuvconstants.c libyuv.a - $(CXX) $(CXXFLAGS) -Iutil/ -lm -o $@ util/yuvconstants.c libyuv.a - -# A standalone test utility -psnr: util/psnr.cc - $(CXX) $(CXXFLAGS) -Iutil/ -o $@ util/psnr.cc util/psnr_main.cc util/ssim.cc - -# A simple conversion example. -i444tonv12_eg: util/i444tonv12_eg.cc libyuv.a - $(CXX) $(CXXFLAGS) -o $@ util/i444tonv12_eg.cc libyuv.a - -# A C test utility that uses libyuv conversion from C. -# gcc 4.4 and older require -fno-exceptions to avoid link error on __gxx_personality_v0 -# CC=gcc-4.4 CXXFLAGS=-fno-exceptions CXX=g++-4.4 make -f linux.mk -cpuid: util/cpuid.c libyuv.a - $(CC) $(CFLAGS) -o $@ util/cpuid.c libyuv.a - -clean: - /bin/rm -f source/*.o *.ii *.s libyuv.a i444tonv12_eg yuvconvert yuvconstants cpuid psnr diff --git a/thirdparty/libyuv/public.mk b/thirdparty/libyuv/public.mk deleted file mode 100644 index 1342307..0000000 --- a/thirdparty/libyuv/public.mk +++ /dev/null @@ -1,13 +0,0 @@ -# This file contains all the common make variables which are useful for -# anyone depending on this library. -# Note that dependencies on NDK are not directly listed since NDK auto adds -# them. - -LIBYUV_INCLUDES := $(LIBYUV_PATH)/include - -LIBYUV_C_FLAGS := - -LIBYUV_CPP_FLAGS := - -LIBYUV_LDLIBS := -LIBYUV_DEP_MODULES := diff --git a/thirdparty/libyuv/pylintrc b/thirdparty/libyuv/pylintrc deleted file mode 100644 index b8bea33..0000000 --- a/thirdparty/libyuv/pylintrc +++ /dev/null @@ -1,17 +0,0 @@ -[MESSAGES CONTROL] - -# Disable the message, report, category or checker with the given id(s). -# TODO(kjellander): Reduce this list to as small as possible. 
-disable=I0010,I0011,bad-continuation,broad-except,duplicate-code,eval-used,exec-used,fixme,invalid-name,missing-docstring,no-init,no-member,too-few-public-methods,too-many-ancestors,too-many-arguments,too-many-branches,too-many-function-args,too-many-instance-attributes,too-many-lines,too-many-locals,too-many-public-methods,too-many-return-statements,too-many-statements
-
-
-[REPORTS]
-
-# Don't write out full reports, just messages.
-reports=no
-
-
-[FORMAT]
-
-# We use two spaces for indents, instead of the usual four spaces or tab.
-indent-string='  '
diff --git a/thirdparty/libyuv/source/compare.cc b/thirdparty/libyuv/source/compare.cc
deleted file mode 100644
index e93aba1..0000000
--- a/thirdparty/libyuv/source/compare.cc
+++ /dev/null
@@ -1,440 +0,0 @@
-/*
- *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS. All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "libyuv/compare.h"
-
-#include <float.h>
-#include <math.h>
-#ifdef _OPENMP
-#include <omp.h>
-#endif
-
-#include "libyuv/basic_types.h"
-#include "libyuv/compare_row.h"
-#include "libyuv/cpu_id.h"
-#include "libyuv/row.h"
-#include "libyuv/video_common.h"
-
-#ifdef __cplusplus
-namespace libyuv {
-extern "C" {
-#endif
-
-// hash seed of 5381 recommended.
-LIBYUV_API
-uint32_t HashDjb2(const uint8_t* src, uint64_t count, uint32_t seed) {
-  const int kBlockSize = 1 << 15;  // 32768;
-  int remainder;
-  uint32_t (*HashDjb2_SSE)(const uint8_t* src, int count, uint32_t seed) =
-      HashDjb2_C;
-#if defined(HAS_HASHDJB2_SSE41)
-  if (TestCpuFlag(kCpuHasSSE41)) {
-    HashDjb2_SSE = HashDjb2_SSE41;
-  }
-#endif
-#if defined(HAS_HASHDJB2_AVX2)
-  if (TestCpuFlag(kCpuHasAVX2)) {
-    HashDjb2_SSE = HashDjb2_AVX2;
-  }
-#endif
-
-  while (count >= (uint64_t)(kBlockSize)) {
-    seed = HashDjb2_SSE(src, kBlockSize, seed);
-    src += kBlockSize;
-    count -= kBlockSize;
-  }
-  remainder = (int)count & ~15;
-  if (remainder) {
-    seed = HashDjb2_SSE(src, remainder, seed);
-    src += remainder;
-    count -= remainder;
-  }
-  remainder = (int)count & 15;
-  if (remainder) {
-    seed = HashDjb2_C(src, remainder, seed);
-  }
-  return seed;
-}
-
-static uint32_t ARGBDetectRow_C(const uint8_t* argb, int width) {
-  int x;
-  for (x = 0; x < width - 1; x += 2) {
-    if (argb[0] != 255) {  // First byte is not Alpha of 255, so not ARGB.
-      return FOURCC_BGRA;
-    }
-    if (argb[3] != 255) {  // Fourth byte is not Alpha of 255, so not BGRA.
-      return FOURCC_ARGB;
-    }
-    if (argb[4] != 255) {  // Second pixel first byte is not Alpha of 255.
-      return FOURCC_BGRA;
-    }
-    if (argb[7] != 255) {  // Second pixel fourth byte is not Alpha of 255.
-      return FOURCC_ARGB;
-    }
-    argb += 8;
-  }
-  if (width & 1) {
-    if (argb[0] != 255) {  // First byte is not Alpha of 255, so not ARGB.
-      return FOURCC_BGRA;
-    }
-    if (argb[3] != 255) {  // 4th byte is not Alpha of 255, so not BGRA.
-      return FOURCC_ARGB;
-    }
-  }
-  return 0;
-}
-
-// Scan an opaque argb image and return fourcc based on alpha offset.
-// Returns FOURCC_ARGB, FOURCC_BGRA, or 0 if unknown.
-LIBYUV_API
-uint32_t ARGBDetect(const uint8_t* argb,
-                    int stride_argb,
-                    int width,
-                    int height) {
-  uint32_t fourcc = 0;
-  int h;
-
-  // Coalesce rows.
- if (stride_argb == width * 4) { - width *= height; - height = 1; - stride_argb = 0; - } - for (h = 0; h < height && fourcc == 0; ++h) { - fourcc = ARGBDetectRow_C(argb, width); - argb += stride_argb; - } - return fourcc; -} - -// NEON version accumulates in 16 bit shorts which overflow at 65536 bytes. -// So actual maximum is 1 less loop, which is 64436 - 32 bytes. - -LIBYUV_API -uint64_t ComputeHammingDistance(const uint8_t* src_a, - const uint8_t* src_b, - int count) { - const int kBlockSize = 1 << 15; // 32768; - const int kSimdSize = 64; - // SIMD for multiple of 64, and C for remainder - int remainder = count & (kBlockSize - 1) & ~(kSimdSize - 1); - uint64_t diff = 0; - int i; - uint32_t (*HammingDistance)(const uint8_t* src_a, const uint8_t* src_b, - int count) = HammingDistance_C; -#if defined(HAS_HAMMINGDISTANCE_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - HammingDistance = HammingDistance_NEON; - } -#endif -#if defined(HAS_HAMMINGDISTANCE_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - HammingDistance = HammingDistance_SSSE3; - } -#endif -#if defined(HAS_HAMMINGDISTANCE_SSE42) - if (TestCpuFlag(kCpuHasSSE42)) { - HammingDistance = HammingDistance_SSE42; - } -#endif -#if defined(HAS_HAMMINGDISTANCE_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - HammingDistance = HammingDistance_AVX2; - } -#endif -#if defined(HAS_HAMMINGDISTANCE_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - HammingDistance = HammingDistance_MMI; - } -#endif -#if defined(HAS_HAMMINGDISTANCE_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - HammingDistance = HammingDistance_MSA; - } -#endif - -#ifdef _OPENMP -#pragma omp parallel for reduction(+ : diff) -#endif - for (i = 0; i < (count - (kBlockSize - 1)); i += kBlockSize) { - diff += HammingDistance(src_a + i, src_b + i, kBlockSize); - } - src_a += count & ~(kBlockSize - 1); - src_b += count & ~(kBlockSize - 1); - if (remainder) { - diff += HammingDistance(src_a, src_b, remainder); - src_a += remainder; - src_b += remainder; - } - remainder = count & (kSimdSize - 1); - if (remainder) { - diff += HammingDistance_C(src_a, src_b, remainder); - } - return diff; -} - -// TODO(fbarchard): Refactor into row function. -LIBYUV_API -uint64_t ComputeSumSquareError(const uint8_t* src_a, - const uint8_t* src_b, - int count) { - // SumSquareError returns values 0 to 65535 for each squared difference. - // Up to 65536 of those can be summed and remain within a uint32_t. - // After each block of 65536 pixels, accumulate into a uint64_t. - const int kBlockSize = 65536; - int remainder = count & (kBlockSize - 1) & ~31; - uint64_t sse = 0; - int i; - uint32_t (*SumSquareError)(const uint8_t* src_a, const uint8_t* src_b, - int count) = SumSquareError_C; -#if defined(HAS_SUMSQUAREERROR_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - SumSquareError = SumSquareError_NEON; - } -#endif -#if defined(HAS_SUMSQUAREERROR_SSE2) - if (TestCpuFlag(kCpuHasSSE2)) { - // Note only used for multiples of 16 so count is not checked. - SumSquareError = SumSquareError_SSE2; - } -#endif -#if defined(HAS_SUMSQUAREERROR_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - // Note only used for multiples of 32 so count is not checked. 
- SumSquareError = SumSquareError_AVX2; - } -#endif -#if defined(HAS_SUMSQUAREERROR_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - SumSquareError = SumSquareError_MMI; - } -#endif -#if defined(HAS_SUMSQUAREERROR_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - SumSquareError = SumSquareError_MSA; - } -#endif -#ifdef _OPENMP -#pragma omp parallel for reduction(+ : sse) -#endif - for (i = 0; i < (count - (kBlockSize - 1)); i += kBlockSize) { - sse += SumSquareError(src_a + i, src_b + i, kBlockSize); - } - src_a += count & ~(kBlockSize - 1); - src_b += count & ~(kBlockSize - 1); - if (remainder) { - sse += SumSquareError(src_a, src_b, remainder); - src_a += remainder; - src_b += remainder; - } - remainder = count & 31; - if (remainder) { - sse += SumSquareError_C(src_a, src_b, remainder); - } - return sse; -} - -LIBYUV_API -uint64_t ComputeSumSquareErrorPlane(const uint8_t* src_a, - int stride_a, - const uint8_t* src_b, - int stride_b, - int width, - int height) { - uint64_t sse = 0; - int h; - // Coalesce rows. - if (stride_a == width && stride_b == width) { - width *= height; - height = 1; - stride_a = stride_b = 0; - } - for (h = 0; h < height; ++h) { - sse += ComputeSumSquareError(src_a, src_b, width); - src_a += stride_a; - src_b += stride_b; - } - return sse; -} - -LIBYUV_API -double SumSquareErrorToPsnr(uint64_t sse, uint64_t count) { - double psnr; - if (sse > 0) { - double mse = (double)count / (double)sse; - psnr = 10.0 * log10(255.0 * 255.0 * mse); - } else { - psnr = kMaxPsnr; // Limit to prevent divide by 0 - } - - if (psnr > kMaxPsnr) { - psnr = kMaxPsnr; - } - - return psnr; -} - -LIBYUV_API -double CalcFramePsnr(const uint8_t* src_a, - int stride_a, - const uint8_t* src_b, - int stride_b, - int width, - int height) { - const uint64_t samples = (uint64_t)width * (uint64_t)height; - const uint64_t sse = ComputeSumSquareErrorPlane(src_a, stride_a, src_b, - stride_b, width, height); - return SumSquareErrorToPsnr(sse, samples); -} - -LIBYUV_API -double I420Psnr(const uint8_t* src_y_a, - int stride_y_a, - const uint8_t* src_u_a, - int stride_u_a, - const uint8_t* src_v_a, - int stride_v_a, - const uint8_t* src_y_b, - int stride_y_b, - const uint8_t* src_u_b, - int stride_u_b, - const uint8_t* src_v_b, - int stride_v_b, - int width, - int height) { - const uint64_t sse_y = ComputeSumSquareErrorPlane( - src_y_a, stride_y_a, src_y_b, stride_y_b, width, height); - const int width_uv = (width + 1) >> 1; - const int height_uv = (height + 1) >> 1; - const uint64_t sse_u = ComputeSumSquareErrorPlane( - src_u_a, stride_u_a, src_u_b, stride_u_b, width_uv, height_uv); - const uint64_t sse_v = ComputeSumSquareErrorPlane( - src_v_a, stride_v_a, src_v_b, stride_v_b, width_uv, height_uv); - const uint64_t samples = (uint64_t)width * (uint64_t)height + - 2 * ((uint64_t)width_uv * (uint64_t)height_uv); - const uint64_t sse = sse_y + sse_u + sse_v; - return SumSquareErrorToPsnr(sse, samples); -} - -static const int64_t cc1 = 26634; // (64^2*(.01*255)^2 -static const int64_t cc2 = 239708; // (64^2*(.03*255)^2 - -static double Ssim8x8_C(const uint8_t* src_a, - int stride_a, - const uint8_t* src_b, - int stride_b) { - int64_t sum_a = 0; - int64_t sum_b = 0; - int64_t sum_sq_a = 0; - int64_t sum_sq_b = 0; - int64_t sum_axb = 0; - - int i; - for (i = 0; i < 8; ++i) { - int j; - for (j = 0; j < 8; ++j) { - sum_a += src_a[j]; - sum_b += src_b[j]; - sum_sq_a += src_a[j] * src_a[j]; - sum_sq_b += src_b[j] * src_b[j]; - sum_axb += src_a[j] * src_b[j]; - } - - src_a += stride_a; - src_b += stride_b; - } - - { - const int64_t 
count = 64; - // scale the constants by number of pixels - const int64_t c1 = (cc1 * count * count) >> 12; - const int64_t c2 = (cc2 * count * count) >> 12; - - const int64_t sum_a_x_sum_b = sum_a * sum_b; - - const int64_t ssim_n = (2 * sum_a_x_sum_b + c1) * - (2 * count * sum_axb - 2 * sum_a_x_sum_b + c2); - - const int64_t sum_a_sq = sum_a * sum_a; - const int64_t sum_b_sq = sum_b * sum_b; - - const int64_t ssim_d = - (sum_a_sq + sum_b_sq + c1) * - (count * sum_sq_a - sum_a_sq + count * sum_sq_b - sum_b_sq + c2); - - if (ssim_d == 0.0) { - return DBL_MAX; - } - return ssim_n * 1.0 / ssim_d; - } -} - -// We are using a 8x8 moving window with starting location of each 8x8 window -// on the 4x4 pixel grid. Such arrangement allows the windows to overlap -// block boundaries to penalize blocking artifacts. -LIBYUV_API -double CalcFrameSsim(const uint8_t* src_a, - int stride_a, - const uint8_t* src_b, - int stride_b, - int width, - int height) { - int samples = 0; - double ssim_total = 0; - double (*Ssim8x8)(const uint8_t* src_a, int stride_a, const uint8_t* src_b, - int stride_b) = Ssim8x8_C; - - // sample point start with each 4x4 location - int i; - for (i = 0; i < height - 8; i += 4) { - int j; - for (j = 0; j < width - 8; j += 4) { - ssim_total += Ssim8x8(src_a + j, stride_a, src_b + j, stride_b); - samples++; - } - - src_a += stride_a * 4; - src_b += stride_b * 4; - } - - ssim_total /= samples; - return ssim_total; -} - -LIBYUV_API -double I420Ssim(const uint8_t* src_y_a, - int stride_y_a, - const uint8_t* src_u_a, - int stride_u_a, - const uint8_t* src_v_a, - int stride_v_a, - const uint8_t* src_y_b, - int stride_y_b, - const uint8_t* src_u_b, - int stride_u_b, - const uint8_t* src_v_b, - int stride_v_b, - int width, - int height) { - const double ssim_y = - CalcFrameSsim(src_y_a, stride_y_a, src_y_b, stride_y_b, width, height); - const int width_uv = (width + 1) >> 1; - const int height_uv = (height + 1) >> 1; - const double ssim_u = CalcFrameSsim(src_u_a, stride_u_a, src_u_b, stride_u_b, - width_uv, height_uv); - const double ssim_v = CalcFrameSsim(src_v_a, stride_v_a, src_v_b, stride_v_b, - width_uv, height_uv); - return ssim_y * 0.8 + 0.1 * (ssim_u + ssim_v); -} - -#ifdef __cplusplus -} // extern "C" -} // namespace libyuv -#endif diff --git a/thirdparty/libyuv/source/compare_common.cc b/thirdparty/libyuv/source/compare_common.cc deleted file mode 100644 index d1cab8d..0000000 --- a/thirdparty/libyuv/source/compare_common.cc +++ /dev/null @@ -1,74 +0,0 @@ -/* - * Copyright 2012 The LibYuv Project Authors. All rights reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#include "libyuv/basic_types.h" - -#include "libyuv/compare_row.h" - -#ifdef __cplusplus -namespace libyuv { -extern "C" { -#endif - -// Hakmem method for hamming distance. 
-uint32_t HammingDistance_C(const uint8_t* src_a,
-                           const uint8_t* src_b,
-                           int count) {
-  uint32_t diff = 0u;
-
-  int i;
-  for (i = 0; i < count - 3; i += 4) {
-    uint32_t x = *((const uint32_t*)src_a) ^ *((const uint32_t*)src_b);
-    uint32_t u = x - ((x >> 1) & 0x55555555);
-    u = ((u >> 2) & 0x33333333) + (u & 0x33333333);
-    diff += ((((u + (u >> 4)) & 0x0f0f0f0f) * 0x01010101) >> 24);
-    src_a += 4;
-    src_b += 4;
-  }
-
-  for (; i < count; ++i) {
-    uint32_t x = *src_a ^ *src_b;
-    uint32_t u = x - ((x >> 1) & 0x55);
-    u = ((u >> 2) & 0x33) + (u & 0x33);
-    diff += (u + (u >> 4)) & 0x0f;
-    src_a += 1;
-    src_b += 1;
-  }
-
-  return diff;
-}
-
-uint32_t SumSquareError_C(const uint8_t* src_a,
-                          const uint8_t* src_b,
-                          int count) {
-  uint32_t sse = 0u;
-  int i;
-  for (i = 0; i < count; ++i) {
-    int diff = src_a[i] - src_b[i];
-    sse += (uint32_t)(diff * diff);
-  }
-  return sse;
-}
-
-// hash seed of 5381 recommended.
-// Internal C version of HashDjb2 with int sized count for efficiency.
-uint32_t HashDjb2_C(const uint8_t* src, int count, uint32_t seed) {
-  uint32_t hash = seed;
-  int i;
-  for (i = 0; i < count; ++i) {
-    hash += (hash << 5) + src[i];
-  }
-  return hash;
-}
-
-#ifdef __cplusplus
-}  // extern "C"
-}  // namespace libyuv
-#endif
diff --git a/thirdparty/libyuv/source/compare_gcc.cc b/thirdparty/libyuv/source/compare_gcc.cc
deleted file mode 100644
index 7dcbf7d..0000000
--- a/thirdparty/libyuv/source/compare_gcc.cc
+++ /dev/null
@@ -1,360 +0,0 @@
-/*
- *  Copyright 2012 The LibYuv Project Authors. All rights reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS. All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "libyuv/basic_types.h"
-
-#include "libyuv/compare_row.h"
-#include "libyuv/row.h"
-
-#ifdef __cplusplus
-namespace libyuv {
-extern "C" {
-#endif
-
-// This module is for GCC x86 and x64.
-#if !defined(LIBYUV_DISABLE_X86) && \
-    (defined(__x86_64__) || defined(__i386__))
-
-#if defined(__x86_64__)
-uint32_t HammingDistance_SSE42(const uint8_t* src_a,
-                               const uint8_t* src_b,
-                               int count) {
-  uint64_t diff = 0u;
-
-  asm volatile(
-      "xor %3,%3 \n"
-      "xor %%r8,%%r8 \n"
-      "xor %%r9,%%r9 \n"
-      "xor %%r10,%%r10 \n"
-
-      // Process 32 bytes per loop.
-      LABELALIGN
-      "1: \n"
-      "mov (%0),%%rcx \n"
-      "mov 0x8(%0),%%rdx \n"
-      "xor (%1),%%rcx \n"
-      "xor 0x8(%1),%%rdx \n"
-      "popcnt %%rcx,%%rcx \n"
-      "popcnt %%rdx,%%rdx \n"
-      "mov 0x10(%0),%%rsi \n"
-      "mov 0x18(%0),%%rdi \n"
-      "xor 0x10(%1),%%rsi \n"
-      "xor 0x18(%1),%%rdi \n"
-      "popcnt %%rsi,%%rsi \n"
-      "popcnt %%rdi,%%rdi \n"
-      "add $0x20,%0 \n"
-      "add $0x20,%1 \n"
-      "add %%rcx,%3 \n"
-      "add %%rdx,%%r8 \n"
-      "add %%rsi,%%r9 \n"
-      "add %%rdi,%%r10 \n"
-      "sub $0x20,%2 \n"
-      "jg 1b \n"
-
-      "add %%r8, %3 \n"
-      "add %%r9, %3 \n"
-      "add %%r10, %3 \n"
-      : "+r"(src_a),  // %0
-        "+r"(src_b),  // %1
-        "+r"(count),  // %2
-        "=r"(diff)    // %3
-      :
-      : "memory", "cc", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10");
-
-  return static_cast<uint32_t>(diff);
-}
-#else
-uint32_t HammingDistance_SSE42(const uint8_t* src_a,
-                               const uint8_t* src_b,
-                               int count) {
-  uint32_t diff = 0u;
-
-  asm volatile(
-      // Process 16 bytes per loop.
- LABELALIGN - "1: \n" - "mov (%0),%%ecx \n" - "mov 0x4(%0),%%edx \n" - "xor (%1),%%ecx \n" - "xor 0x4(%1),%%edx \n" - "popcnt %%ecx,%%ecx \n" - "add %%ecx,%3 \n" - "popcnt %%edx,%%edx \n" - "add %%edx,%3 \n" - "mov 0x8(%0),%%ecx \n" - "mov 0xc(%0),%%edx \n" - "xor 0x8(%1),%%ecx \n" - "xor 0xc(%1),%%edx \n" - "popcnt %%ecx,%%ecx \n" - "add %%ecx,%3 \n" - "popcnt %%edx,%%edx \n" - "add %%edx,%3 \n" - "add $0x10,%0 \n" - "add $0x10,%1 \n" - "sub $0x10,%2 \n" - "jg 1b \n" - : "+r"(src_a), // %0 - "+r"(src_b), // %1 - "+r"(count), // %2 - "+r"(diff) // %3 - : - : "memory", "cc", "ecx", "edx"); - - return diff; -} -#endif - -static const vec8 kNibbleMask = {15, 15, 15, 15, 15, 15, 15, 15, - 15, 15, 15, 15, 15, 15, 15, 15}; -static const vec8 kBitCount = {0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4}; - -uint32_t HammingDistance_SSSE3(const uint8_t* src_a, - const uint8_t* src_b, - int count) { - uint32_t diff = 0u; - - asm volatile( - "movdqa %4,%%xmm2 \n" - "movdqa %5,%%xmm3 \n" - "pxor %%xmm0,%%xmm0 \n" - "pxor %%xmm1,%%xmm1 \n" - "sub %0,%1 \n" - - LABELALIGN - "1: \n" - "movdqa (%0),%%xmm4 \n" - "movdqa 0x10(%0), %%xmm5 \n" - "pxor (%0,%1), %%xmm4 \n" - "movdqa %%xmm4,%%xmm6 \n" - "pand %%xmm2,%%xmm6 \n" - "psrlw $0x4,%%xmm4 \n" - "movdqa %%xmm3,%%xmm7 \n" - "pshufb %%xmm6,%%xmm7 \n" - "pand %%xmm2,%%xmm4 \n" - "movdqa %%xmm3,%%xmm6 \n" - "pshufb %%xmm4,%%xmm6 \n" - "paddb %%xmm7,%%xmm6 \n" - "pxor 0x10(%0,%1),%%xmm5 \n" - "add $0x20,%0 \n" - "movdqa %%xmm5,%%xmm4 \n" - "pand %%xmm2,%%xmm5 \n" - "psrlw $0x4,%%xmm4 \n" - "movdqa %%xmm3,%%xmm7 \n" - "pshufb %%xmm5,%%xmm7 \n" - "pand %%xmm2,%%xmm4 \n" - "movdqa %%xmm3,%%xmm5 \n" - "pshufb %%xmm4,%%xmm5 \n" - "paddb %%xmm7,%%xmm5 \n" - "paddb %%xmm5,%%xmm6 \n" - "psadbw %%xmm1,%%xmm6 \n" - "paddd %%xmm6,%%xmm0 \n" - "sub $0x20,%2 \n" - "jg 1b \n" - - "pshufd $0xaa,%%xmm0,%%xmm1 \n" - "paddd %%xmm1,%%xmm0 \n" - "movd %%xmm0, %3 \n" - : "+r"(src_a), // %0 - "+r"(src_b), // %1 - "+r"(count), // %2 - "=r"(diff) // %3 - : "m"(kNibbleMask), // %4 - "m"(kBitCount) // %5 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", - "xmm7"); - - return diff; -} - -#ifdef HAS_HAMMINGDISTANCE_AVX2 -uint32_t HammingDistance_AVX2(const uint8_t* src_a, - const uint8_t* src_b, - int count) { - uint32_t diff = 0u; - - asm volatile( - "vbroadcastf128 %4,%%ymm2 \n" - "vbroadcastf128 %5,%%ymm3 \n" - "vpxor %%ymm0,%%ymm0,%%ymm0 \n" - "vpxor %%ymm1,%%ymm1,%%ymm1 \n" - "sub %0,%1 \n" - - LABELALIGN - "1: \n" - "vmovdqa (%0),%%ymm4 \n" - "vmovdqa 0x20(%0), %%ymm5 \n" - "vpxor (%0,%1), %%ymm4, %%ymm4 \n" - "vpand %%ymm2,%%ymm4,%%ymm6 \n" - "vpsrlw $0x4,%%ymm4,%%ymm4 \n" - "vpshufb %%ymm6,%%ymm3,%%ymm6 \n" - "vpand %%ymm2,%%ymm4,%%ymm4 \n" - "vpshufb %%ymm4,%%ymm3,%%ymm4 \n" - "vpaddb %%ymm4,%%ymm6,%%ymm6 \n" - "vpxor 0x20(%0,%1),%%ymm5,%%ymm4 \n" - "add $0x40,%0 \n" - "vpand %%ymm2,%%ymm4,%%ymm5 \n" - "vpsrlw $0x4,%%ymm4,%%ymm4 \n" - "vpshufb %%ymm5,%%ymm3,%%ymm5 \n" - "vpand %%ymm2,%%ymm4,%%ymm4 \n" - "vpshufb %%ymm4,%%ymm3,%%ymm4 \n" - "vpaddb %%ymm5,%%ymm4,%%ymm4 \n" - "vpaddb %%ymm6,%%ymm4,%%ymm4 \n" - "vpsadbw %%ymm1,%%ymm4,%%ymm4 \n" - "vpaddd %%ymm0,%%ymm4,%%ymm0 \n" - "sub $0x40,%2 \n" - "jg 1b \n" - - "vpermq $0xb1,%%ymm0,%%ymm1 \n" - "vpaddd %%ymm1,%%ymm0,%%ymm0 \n" - "vpermq $0xaa,%%ymm0,%%ymm1 \n" - "vpaddd %%ymm1,%%ymm0,%%ymm0 \n" - "vmovd %%xmm0, %3 \n" - "vzeroupper \n" - : "+r"(src_a), // %0 - "+r"(src_b), // %1 - "+r"(count), // %2 - "=r"(diff) // %3 - : "m"(kNibbleMask), // %4 - "m"(kBitCount) // %5 - : "memory", "cc", "xmm0", 
"xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"); - - return diff; -} -#endif // HAS_HAMMINGDISTANCE_AVX2 - -uint32_t SumSquareError_SSE2(const uint8_t* src_a, - const uint8_t* src_b, - int count) { - uint32_t sse; - asm volatile( - "pxor %%xmm0,%%xmm0 \n" - "pxor %%xmm5,%%xmm5 \n" - - LABELALIGN - "1: \n" - "movdqu (%0),%%xmm1 \n" - "lea 0x10(%0),%0 \n" - "movdqu (%1),%%xmm2 \n" - "lea 0x10(%1),%1 \n" - "movdqa %%xmm1,%%xmm3 \n" - "psubusb %%xmm2,%%xmm1 \n" - "psubusb %%xmm3,%%xmm2 \n" - "por %%xmm2,%%xmm1 \n" - "movdqa %%xmm1,%%xmm2 \n" - "punpcklbw %%xmm5,%%xmm1 \n" - "punpckhbw %%xmm5,%%xmm2 \n" - "pmaddwd %%xmm1,%%xmm1 \n" - "pmaddwd %%xmm2,%%xmm2 \n" - "paddd %%xmm1,%%xmm0 \n" - "paddd %%xmm2,%%xmm0 \n" - "sub $0x10,%2 \n" - "jg 1b \n" - - "pshufd $0xee,%%xmm0,%%xmm1 \n" - "paddd %%xmm1,%%xmm0 \n" - "pshufd $0x1,%%xmm0,%%xmm1 \n" - "paddd %%xmm1,%%xmm0 \n" - "movd %%xmm0,%3 \n" - - : "+r"(src_a), // %0 - "+r"(src_b), // %1 - "+r"(count), // %2 - "=g"(sse) // %3 - ::"memory", - "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"); - return sse; -} - -static const uvec32 kHash16x33 = {0x92d9e201, 0, 0, 0}; // 33 ^ 16 -static const uvec32 kHashMul0 = { - 0x0c3525e1, // 33 ^ 15 - 0xa3476dc1, // 33 ^ 14 - 0x3b4039a1, // 33 ^ 13 - 0x4f5f0981, // 33 ^ 12 -}; -static const uvec32 kHashMul1 = { - 0x30f35d61, // 33 ^ 11 - 0x855cb541, // 33 ^ 10 - 0x040a9121, // 33 ^ 9 - 0x747c7101, // 33 ^ 8 -}; -static const uvec32 kHashMul2 = { - 0xec41d4e1, // 33 ^ 7 - 0x4cfa3cc1, // 33 ^ 6 - 0x025528a1, // 33 ^ 5 - 0x00121881, // 33 ^ 4 -}; -static const uvec32 kHashMul3 = { - 0x00008c61, // 33 ^ 3 - 0x00000441, // 33 ^ 2 - 0x00000021, // 33 ^ 1 - 0x00000001, // 33 ^ 0 -}; - -uint32_t HashDjb2_SSE41(const uint8_t* src, int count, uint32_t seed) { - uint32_t hash; - asm volatile( - "movd %2,%%xmm0 \n" - "pxor %%xmm7,%%xmm7 \n" - "movdqa %4,%%xmm6 \n" - - LABELALIGN - "1: \n" - "movdqu (%0),%%xmm1 \n" - "lea 0x10(%0),%0 \n" - "pmulld %%xmm6,%%xmm0 \n" - "movdqa %5,%%xmm5 \n" - "movdqa %%xmm1,%%xmm2 \n" - "punpcklbw %%xmm7,%%xmm2 \n" - "movdqa %%xmm2,%%xmm3 \n" - "punpcklwd %%xmm7,%%xmm3 \n" - "pmulld %%xmm5,%%xmm3 \n" - "movdqa %6,%%xmm5 \n" - "movdqa %%xmm2,%%xmm4 \n" - "punpckhwd %%xmm7,%%xmm4 \n" - "pmulld %%xmm5,%%xmm4 \n" - "movdqa %7,%%xmm5 \n" - "punpckhbw %%xmm7,%%xmm1 \n" - "movdqa %%xmm1,%%xmm2 \n" - "punpcklwd %%xmm7,%%xmm2 \n" - "pmulld %%xmm5,%%xmm2 \n" - "movdqa %8,%%xmm5 \n" - "punpckhwd %%xmm7,%%xmm1 \n" - "pmulld %%xmm5,%%xmm1 \n" - "paddd %%xmm4,%%xmm3 \n" - "paddd %%xmm2,%%xmm1 \n" - "paddd %%xmm3,%%xmm1 \n" - "pshufd $0xe,%%xmm1,%%xmm2 \n" - "paddd %%xmm2,%%xmm1 \n" - "pshufd $0x1,%%xmm1,%%xmm2 \n" - "paddd %%xmm2,%%xmm1 \n" - "paddd %%xmm1,%%xmm0 \n" - "sub $0x10,%1 \n" - "jg 1b \n" - "movd %%xmm0,%3 \n" - : "+r"(src), // %0 - "+r"(count), // %1 - "+rm"(seed), // %2 - "=g"(hash) // %3 - : "m"(kHash16x33), // %4 - "m"(kHashMul0), // %5 - "m"(kHashMul1), // %6 - "m"(kHashMul2), // %7 - "m"(kHashMul3) // %8 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", - "xmm7"); - return hash; -} -#endif // defined(__x86_64__) || (defined(__i386__) && !defined(__pic__))) - -#ifdef __cplusplus -} // extern "C" -} // namespace libyuv -#endif diff --git a/thirdparty/libyuv/source/compare_mmi.cc b/thirdparty/libyuv/source/compare_mmi.cc deleted file mode 100644 index 7640d94..0000000 --- a/thirdparty/libyuv/source/compare_mmi.cc +++ /dev/null @@ -1,123 +0,0 @@ -/* - * Copyright 2012 The LibYuv Project Authors. All rights reserved. 
- * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#include "libyuv/basic_types.h" - -#include "libyuv/compare_row.h" - -#ifdef __cplusplus -namespace libyuv { -extern "C" { -#endif - -// This module is for Mips MMI. -#if !defined(LIBYUV_DISABLE_MMI) && defined(_MIPS_ARCH_LOONGSON3A) - -// Hakmem method for hamming distance. -uint32_t HammingDistance_MMI(const uint8_t* src_a, - const uint8_t* src_b, - int count) { - uint32_t diff = 0u; - - uint64_t temp = 0, temp1 = 0, ta = 0, tb = 0; - uint64_t c1 = 0x5555555555555555; - uint64_t c2 = 0x3333333333333333; - uint64_t c3 = 0x0f0f0f0f0f0f0f0f; - uint32_t c4 = 0x01010101; - uint64_t s1 = 1, s2 = 2, s3 = 4; - __asm__ volatile( - "1: \n\t" - "ldc1 %[ta], 0(%[src_a]) \n\t" - "ldc1 %[tb], 0(%[src_b]) \n\t" - "xor %[temp], %[ta], %[tb] \n\t" - "psrlw %[temp1], %[temp], %[s1] \n\t" // temp1=x>>1 - "and %[temp1], %[temp1], %[c1] \n\t" // temp1&=c1 - "psubw %[temp1], %[temp], %[temp1] \n\t" // x-temp1 - "and %[temp], %[temp1], %[c2] \n\t" // t = (u&c2) - "psrlw %[temp1], %[temp1], %[s2] \n\t" // u>>2 - "and %[temp1], %[temp1], %[c2] \n\t" // u>>2 & c2 - "paddw %[temp1], %[temp1], %[temp] \n\t" // t1 = t1+t - "psrlw %[temp], %[temp1], %[s3] \n\t" // u>>4 - "paddw %[temp1], %[temp1], %[temp] \n\t" // u+(u>>4) - "and %[temp1], %[temp1], %[c3] \n\t" //&c3 - "dmfc1 $t0, %[temp1] \n\t" - "dsrl32 $t0, $t0, 0 \n\t " - "mul $t0, $t0, %[c4] \n\t" - "dsrl $t0, $t0, 24 \n\t" - "dadd %[diff], %[diff], $t0 \n\t" - "dmfc1 $t0, %[temp1] \n\t" - "mul $t0, $t0, %[c4] \n\t" - "dsrl $t0, $t0, 24 \n\t" - "dadd %[diff], %[diff], $t0 \n\t" - "daddiu %[src_a], %[src_a], 8 \n\t" - "daddiu %[src_b], %[src_b], 8 \n\t" - "addiu %[count], %[count], -8 \n\t" - "bgtz %[count], 1b \n\t" - "nop \n\t" - : [diff] "+r"(diff), [src_a] "+r"(src_a), [src_b] "+r"(src_b), - [count] "+r"(count), [ta] "+f"(ta), [tb] "+f"(tb), [temp] "+f"(temp), - [temp1] "+f"(temp1) - : [c1] "f"(c1), [c2] "f"(c2), [c3] "f"(c3), [c4] "r"(c4), [s1] "f"(s1), - [s2] "f"(s2), [s3] "f"(s3) - : "memory"); - return diff; -} - -uint32_t SumSquareError_MMI(const uint8_t* src_a, - const uint8_t* src_b, - int count) { - uint32_t sse = 0u; - uint32_t sse_hi = 0u, sse_lo = 0u; - - uint64_t src1, src2; - uint64_t diff, diff_hi, diff_lo; - uint64_t sse_sum, sse_tmp; - - const uint64_t mask = 0x0ULL; - - __asm__ volatile( - "xor %[sse_sum], %[sse_sum], %[sse_sum] \n\t" - - "1: \n\t" - "ldc1 %[src1], 0x00(%[src_a]) \n\t" - "ldc1 %[src2], 0x00(%[src_b]) \n\t" - "pasubub %[diff], %[src1], %[src2] \n\t" - "punpcklbh %[diff_lo], %[diff], %[mask] \n\t" - "punpckhbh %[diff_hi], %[diff], %[mask] \n\t" - "pmaddhw %[sse_tmp], %[diff_lo], %[diff_lo] \n\t" - "paddw %[sse_sum], %[sse_sum], %[sse_tmp] \n\t" - "pmaddhw %[sse_tmp], %[diff_hi], %[diff_hi] \n\t" - "paddw %[sse_sum], %[sse_sum], %[sse_tmp] \n\t" - - "daddiu %[src_a], %[src_a], 0x08 \n\t" - "daddiu %[src_b], %[src_b], 0x08 \n\t" - "daddiu %[count], %[count], -0x08 \n\t" - "bnez %[count], 1b \n\t" - - "mfc1 %[sse_lo], %[sse_sum] \n\t" - "mfhc1 %[sse_hi], %[sse_sum] \n\t" - "daddu %[sse], %[sse_hi], %[sse_lo] \n\t" - : [sse] "+&r"(sse), [diff] "=&f"(diff), [src1] "=&f"(src1), - [src2] "=&f"(src2), [diff_lo] "=&f"(diff_lo), [diff_hi] "=&f"(diff_hi), - [sse_sum] "=&f"(sse_sum), [sse_tmp] 
"=&f"(sse_tmp), - [sse_hi] "+&r"(sse_hi), [sse_lo] "+&r"(sse_lo) - : [src_a] "r"(src_a), [src_b] "r"(src_b), [count] "r"(count), - [mask] "f"(mask) - : "memory"); - - return sse; -} - -#endif // !defined(LIBYUV_DISABLE_MMI) && defined(_MIPS_ARCH_LOONGSON3A) - -#ifdef __cplusplus -} // extern "C" -} // namespace libyuv -#endif diff --git a/thirdparty/libyuv/source/compare_msa.cc b/thirdparty/libyuv/source/compare_msa.cc deleted file mode 100644 index 0b807d3..0000000 --- a/thirdparty/libyuv/source/compare_msa.cc +++ /dev/null @@ -1,97 +0,0 @@ -/* - * Copyright 2017 The LibYuv Project Authors. All rights reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#include "libyuv/basic_types.h" - -#include "libyuv/compare_row.h" -#include "libyuv/row.h" - -// This module is for GCC MSA -#if !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa) -#include "libyuv/macros_msa.h" - -#ifdef __cplusplus -namespace libyuv { -extern "C" { -#endif - -uint32_t HammingDistance_MSA(const uint8_t* src_a, - const uint8_t* src_b, - int count) { - uint32_t diff = 0u; - int i; - v16u8 src0, src1, src2, src3; - v2i64 vec0 = {0}, vec1 = {0}; - - for (i = 0; i < count; i += 32) { - src0 = (v16u8)__msa_ld_b((v16i8*)src_a, 0); - src1 = (v16u8)__msa_ld_b((v16i8*)src_a, 16); - src2 = (v16u8)__msa_ld_b((v16i8*)src_b, 0); - src3 = (v16u8)__msa_ld_b((v16i8*)src_b, 16); - src0 ^= src2; - src1 ^= src3; - vec0 += __msa_pcnt_d((v2i64)src0); - vec1 += __msa_pcnt_d((v2i64)src1); - src_a += 32; - src_b += 32; - } - - vec0 += vec1; - diff = (uint32_t)__msa_copy_u_w((v4i32)vec0, 0); - diff += (uint32_t)__msa_copy_u_w((v4i32)vec0, 2); - return diff; -} - -uint32_t SumSquareError_MSA(const uint8_t* src_a, - const uint8_t* src_b, - int count) { - uint32_t sse = 0u; - int i; - v16u8 src0, src1, src2, src3; - v8i16 vec0, vec1, vec2, vec3; - v4i32 reg0 = {0}, reg1 = {0}, reg2 = {0}, reg3 = {0}; - v2i64 tmp0; - - for (i = 0; i < count; i += 32) { - src0 = (v16u8)__msa_ld_b((v16i8*)src_a, 0); - src1 = (v16u8)__msa_ld_b((v16i8*)src_a, 16); - src2 = (v16u8)__msa_ld_b((v16i8*)src_b, 0); - src3 = (v16u8)__msa_ld_b((v16i8*)src_b, 16); - vec0 = (v8i16)__msa_ilvr_b((v16i8)src2, (v16i8)src0); - vec1 = (v8i16)__msa_ilvl_b((v16i8)src2, (v16i8)src0); - vec2 = (v8i16)__msa_ilvr_b((v16i8)src3, (v16i8)src1); - vec3 = (v8i16)__msa_ilvl_b((v16i8)src3, (v16i8)src1); - vec0 = __msa_hsub_u_h((v16u8)vec0, (v16u8)vec0); - vec1 = __msa_hsub_u_h((v16u8)vec1, (v16u8)vec1); - vec2 = __msa_hsub_u_h((v16u8)vec2, (v16u8)vec2); - vec3 = __msa_hsub_u_h((v16u8)vec3, (v16u8)vec3); - reg0 = __msa_dpadd_s_w(reg0, vec0, vec0); - reg1 = __msa_dpadd_s_w(reg1, vec1, vec1); - reg2 = __msa_dpadd_s_w(reg2, vec2, vec2); - reg3 = __msa_dpadd_s_w(reg3, vec3, vec3); - src_a += 32; - src_b += 32; - } - - reg0 += reg1; - reg2 += reg3; - reg0 += reg2; - tmp0 = __msa_hadd_s_d(reg0, reg0); - sse = (uint32_t)__msa_copy_u_w((v4i32)tmp0, 0); - sse += (uint32_t)__msa_copy_u_w((v4i32)tmp0, 2); - return sse; -} - -#ifdef __cplusplus -} // extern "C" -} // namespace libyuv -#endif - -#endif // !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa) diff --git a/thirdparty/libyuv/source/compare_neon.cc b/thirdparty/libyuv/source/compare_neon.cc deleted file mode 100644 index afdd601..0000000 --- 
a/thirdparty/libyuv/source/compare_neon.cc +++ /dev/null @@ -1,96 +0,0 @@ -/* - * Copyright 2012 The LibYuv Project Authors. All rights reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#include "libyuv/basic_types.h" - -#include "libyuv/compare_row.h" -#include "libyuv/row.h" - -#ifdef __cplusplus -namespace libyuv { -extern "C" { -#endif - -#if !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__) && \ - !defined(__aarch64__) - -// 256 bits at a time -// uses short accumulator which restricts count to 131 KB -uint32_t HammingDistance_NEON(const uint8_t* src_a, - const uint8_t* src_b, - int count) { - uint32_t diff; - - asm volatile( - "vmov.u16 q4, #0 \n" // accumulator - - "1: \n" - "vld1.8 {q0, q1}, [%0]! \n" - "vld1.8 {q2, q3}, [%1]! \n" - "veor.32 q0, q0, q2 \n" - "veor.32 q1, q1, q3 \n" - "vcnt.i8 q0, q0 \n" - "vcnt.i8 q1, q1 \n" - "subs %2, %2, #32 \n" - "vadd.u8 q0, q0, q1 \n" // 16 byte counts - "vpadal.u8 q4, q0 \n" // 8 shorts - "bgt 1b \n" - - "vpaddl.u16 q0, q4 \n" // 4 ints - "vpadd.u32 d0, d0, d1 \n" - "vpadd.u32 d0, d0, d0 \n" - "vmov.32 %3, d0[0] \n" - - : "+r"(src_a), "+r"(src_b), "+r"(count), "=r"(diff) - : - : "cc", "q0", "q1", "q2", "q3", "q4"); - return diff; -} - -uint32_t SumSquareError_NEON(const uint8_t* src_a, - const uint8_t* src_b, - int count) { - uint32_t sse; - asm volatile( - "vmov.u8 q8, #0 \n" - "vmov.u8 q10, #0 \n" - "vmov.u8 q9, #0 \n" - "vmov.u8 q11, #0 \n" - - "1: \n" - "vld1.8 {q0}, [%0]! \n" - "vld1.8 {q1}, [%1]! \n" - "subs %2, %2, #16 \n" - "vsubl.u8 q2, d0, d2 \n" - "vsubl.u8 q3, d1, d3 \n" - "vmlal.s16 q8, d4, d4 \n" - "vmlal.s16 q9, d6, d6 \n" - "vmlal.s16 q10, d5, d5 \n" - "vmlal.s16 q11, d7, d7 \n" - "bgt 1b \n" - - "vadd.u32 q8, q8, q9 \n" - "vadd.u32 q10, q10, q11 \n" - "vadd.u32 q11, q8, q10 \n" - "vpaddl.u32 q1, q11 \n" - "vadd.u64 d0, d2, d3 \n" - "vmov.32 %3, d0[0] \n" - : "+r"(src_a), "+r"(src_b), "+r"(count), "=r"(sse) - : - : "memory", "cc", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11"); - return sse; -} - -#endif // defined(__ARM_NEON__) && !defined(__aarch64__) - -#ifdef __cplusplus -} // extern "C" -} // namespace libyuv -#endif diff --git a/thirdparty/libyuv/source/compare_neon64.cc b/thirdparty/libyuv/source/compare_neon64.cc deleted file mode 100644 index 70fb9b9..0000000 --- a/thirdparty/libyuv/source/compare_neon64.cc +++ /dev/null @@ -1,94 +0,0 @@ -/* - * Copyright 2012 The LibYuv Project Authors. All rights reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. 
- */ - -#include "libyuv/basic_types.h" - -#include "libyuv/compare_row.h" -#include "libyuv/row.h" - -#ifdef __cplusplus -namespace libyuv { -extern "C" { -#endif - -#if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__) - -// 256 bits at a time -// uses short accumulator which restricts count to 131 KB -uint32_t HammingDistance_NEON(const uint8_t* src_a, - const uint8_t* src_b, - int count) { - uint32_t diff; - asm volatile( - "movi v4.8h, #0 \n" - - "1: \n" - "ld1 {v0.16b, v1.16b}, [%0], #32 \n" - "ld1 {v2.16b, v3.16b}, [%1], #32 \n" - "eor v0.16b, v0.16b, v2.16b \n" - "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead - "eor v1.16b, v1.16b, v3.16b \n" - "cnt v0.16b, v0.16b \n" - "prfm pldl1keep, [%1, 448] \n" - "cnt v1.16b, v1.16b \n" - "subs %w2, %w2, #32 \n" - "add v0.16b, v0.16b, v1.16b \n" - "uadalp v4.8h, v0.16b \n" - "b.gt 1b \n" - - "uaddlv s4, v4.8h \n" - "fmov %w3, s4 \n" - : "+r"(src_a), "+r"(src_b), "+r"(count), "=r"(diff) - : - : "cc", "v0", "v1", "v2", "v3", "v4"); - return diff; -} - -uint32_t SumSquareError_NEON(const uint8_t* src_a, - const uint8_t* src_b, - int count) { - uint32_t sse; - asm volatile( - "eor v16.16b, v16.16b, v16.16b \n" - "eor v18.16b, v18.16b, v18.16b \n" - "eor v17.16b, v17.16b, v17.16b \n" - "eor v19.16b, v19.16b, v19.16b \n" - - "1: \n" - "ld1 {v0.16b}, [%0], #16 \n" - "ld1 {v1.16b}, [%1], #16 \n" - "subs %w2, %w2, #16 \n" - "usubl v2.8h, v0.8b, v1.8b \n" - "usubl2 v3.8h, v0.16b, v1.16b \n" - "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead - "smlal v16.4s, v2.4h, v2.4h \n" - "smlal v17.4s, v3.4h, v3.4h \n" - "prfm pldl1keep, [%1, 448] \n" - "smlal2 v18.4s, v2.8h, v2.8h \n" - "smlal2 v19.4s, v3.8h, v3.8h \n" - "b.gt 1b \n" - - "add v16.4s, v16.4s, v17.4s \n" - "add v18.4s, v18.4s, v19.4s \n" - "add v19.4s, v16.4s, v18.4s \n" - "addv s0, v19.4s \n" - "fmov %w3, s0 \n" - : "+r"(src_a), "+r"(src_b), "+r"(count), "=r"(sse) - : - : "cc", "v0", "v1", "v2", "v3", "v16", "v17", "v18", "v19"); - return sse; -} - -#endif // !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__) - -#ifdef __cplusplus -} // extern "C" -} // namespace libyuv -#endif diff --git a/thirdparty/libyuv/source/compare_win.cc b/thirdparty/libyuv/source/compare_win.cc deleted file mode 100644 index 9bb27f1..0000000 --- a/thirdparty/libyuv/source/compare_win.cc +++ /dev/null @@ -1,241 +0,0 @@ -/* - * Copyright 2012 The LibYuv Project Authors. All rights reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. 
- */
-
-#include "libyuv/basic_types.h"
-
-#include "libyuv/compare_row.h"
-#include "libyuv/row.h"
-
-#if defined(_MSC_VER)
-#include <intrin.h>  // For __popcnt
-#endif
-
-#ifdef __cplusplus
-namespace libyuv {
-extern "C" {
-#endif
-
-// This module is for 32 bit Visual C x86
-#if !defined(LIBYUV_DISABLE_X86) && defined(_MSC_VER) && \
-    !defined(__clang__) && defined(_M_IX86)
-
-uint32_t HammingDistance_SSE42(const uint8_t* src_a,
-                               const uint8_t* src_b,
-                               int count) {
-  uint32_t diff = 0u;
-
-  int i;
-  for (i = 0; i < count - 3; i += 4) {
-    uint32_t x = *((uint32_t*)src_a) ^ *((uint32_t*)src_b);  // NOLINT
-    src_a += 4;
-    src_b += 4;
-    diff += __popcnt(x);
-  }
-  return diff;
-}
-
-__declspec(naked) uint32_t
-    SumSquareError_SSE2(const uint8_t* src_a, const uint8_t* src_b, int count) {
-  __asm {
-    mov eax, [esp + 4]  // src_a
-    mov edx, [esp + 8]  // src_b
-    mov ecx, [esp + 12]  // count
-    pxor xmm0, xmm0
-    pxor xmm5, xmm5
-
-  wloop:
-    movdqu xmm1, [eax]
-    lea eax, [eax + 16]
-    movdqu xmm2, [edx]
-    lea edx, [edx + 16]
-    movdqa xmm3, xmm1  // abs trick
-    psubusb xmm1, xmm2
-    psubusb xmm2, xmm3
-    por xmm1, xmm2
-    movdqa xmm2, xmm1
-    punpcklbw xmm1, xmm5
-    punpckhbw xmm2, xmm5
-    pmaddwd xmm1, xmm1
-    pmaddwd xmm2, xmm2
-    paddd xmm0, xmm1
-    paddd xmm0, xmm2
-    sub ecx, 16
-    jg wloop
-
-    pshufd xmm1, xmm0, 0xee
-    paddd xmm0, xmm1
-    pshufd xmm1, xmm0, 0x01
-    paddd xmm0, xmm1
-    movd eax, xmm0
-    ret
-  }
-}
-
-#ifdef HAS_SUMSQUAREERROR_AVX2
-// C4752: found Intel(R) Advanced Vector Extensions; consider using /arch:AVX.
-#pragma warning(disable : 4752)
-__declspec(naked) uint32_t
-    SumSquareError_AVX2(const uint8_t* src_a, const uint8_t* src_b, int count) {
-  __asm {
-    mov eax, [esp + 4]  // src_a
-    mov edx, [esp + 8]  // src_b
-    mov ecx, [esp + 12]  // count
-    vpxor ymm0, ymm0, ymm0  // sum
-    vpxor ymm5, ymm5, ymm5  // constant 0 for unpck
-    sub edx, eax
-
-  wloop:
-    vmovdqu ymm1, [eax]
-    vmovdqu ymm2, [eax + edx]
-    lea eax, [eax + 32]
-    vpsubusb ymm3, ymm1, ymm2  // abs difference trick
-    vpsubusb ymm2, ymm2, ymm1
-    vpor ymm1, ymm2, ymm3
-    vpunpcklbw ymm2, ymm1, ymm5  // u16. mutates order.
-    vpunpckhbw ymm1, ymm1, ymm5
-    vpmaddwd ymm2, ymm2, ymm2  // square + hadd to u32.
-    vpmaddwd ymm1, ymm1, ymm1
-    vpaddd ymm0, ymm0, ymm1
-    vpaddd ymm0, ymm0, ymm2
-    sub ecx, 32
-    jg wloop
-
-    vpshufd ymm1, ymm0, 0xee  // 3, 2 + 1, 0 both lanes.
-    vpaddd ymm0, ymm0, ymm1
-    vpshufd ymm1, ymm0, 0x01  // 1 + 0 both lanes.
-    vpaddd ymm0, ymm0, ymm1
-    vpermq ymm1, ymm0, 0x02  // high + low lane.
- vpaddd ymm0, ymm0, ymm1 - vmovd eax, xmm0 - vzeroupper - ret - } -} -#endif // HAS_SUMSQUAREERROR_AVX2 - -uvec32 kHash16x33 = {0x92d9e201, 0, 0, 0}; // 33 ^ 16 -uvec32 kHashMul0 = { - 0x0c3525e1, // 33 ^ 15 - 0xa3476dc1, // 33 ^ 14 - 0x3b4039a1, // 33 ^ 13 - 0x4f5f0981, // 33 ^ 12 -}; -uvec32 kHashMul1 = { - 0x30f35d61, // 33 ^ 11 - 0x855cb541, // 33 ^ 10 - 0x040a9121, // 33 ^ 9 - 0x747c7101, // 33 ^ 8 -}; -uvec32 kHashMul2 = { - 0xec41d4e1, // 33 ^ 7 - 0x4cfa3cc1, // 33 ^ 6 - 0x025528a1, // 33 ^ 5 - 0x00121881, // 33 ^ 4 -}; -uvec32 kHashMul3 = { - 0x00008c61, // 33 ^ 3 - 0x00000441, // 33 ^ 2 - 0x00000021, // 33 ^ 1 - 0x00000001, // 33 ^ 0 -}; - -__declspec(naked) uint32_t - HashDjb2_SSE41(const uint8_t* src, int count, uint32_t seed) { - __asm { - mov eax, [esp + 4] // src - mov ecx, [esp + 8] // count - movd xmm0, [esp + 12] // seed - - pxor xmm7, xmm7 // constant 0 for unpck - movdqa xmm6, xmmword ptr kHash16x33 - - wloop: - movdqu xmm1, [eax] // src[0-15] - lea eax, [eax + 16] - pmulld xmm0, xmm6 // hash *= 33 ^ 16 - movdqa xmm5, xmmword ptr kHashMul0 - movdqa xmm2, xmm1 - punpcklbw xmm2, xmm7 // src[0-7] - movdqa xmm3, xmm2 - punpcklwd xmm3, xmm7 // src[0-3] - pmulld xmm3, xmm5 - movdqa xmm5, xmmword ptr kHashMul1 - movdqa xmm4, xmm2 - punpckhwd xmm4, xmm7 // src[4-7] - pmulld xmm4, xmm5 - movdqa xmm5, xmmword ptr kHashMul2 - punpckhbw xmm1, xmm7 // src[8-15] - movdqa xmm2, xmm1 - punpcklwd xmm2, xmm7 // src[8-11] - pmulld xmm2, xmm5 - movdqa xmm5, xmmword ptr kHashMul3 - punpckhwd xmm1, xmm7 // src[12-15] - pmulld xmm1, xmm5 - paddd xmm3, xmm4 // add 16 results - paddd xmm1, xmm2 - paddd xmm1, xmm3 - - pshufd xmm2, xmm1, 0x0e // upper 2 dwords - paddd xmm1, xmm2 - pshufd xmm2, xmm1, 0x01 - paddd xmm1, xmm2 - paddd xmm0, xmm1 - sub ecx, 16 - jg wloop - - movd eax, xmm0 // return hash - ret - } -} - -// Visual C 2012 required for AVX2. -#ifdef HAS_HASHDJB2_AVX2 -__declspec(naked) uint32_t - HashDjb2_AVX2(const uint8_t* src, int count, uint32_t seed) { - __asm { - mov eax, [esp + 4] // src - mov ecx, [esp + 8] // count - vmovd xmm0, [esp + 12] // seed - - wloop: - vpmovzxbd xmm3, [eax] // src[0-3] - vpmulld xmm0, xmm0, xmmword ptr kHash16x33 // hash *= 33 ^ 16 - vpmovzxbd xmm4, [eax + 4] // src[4-7] - vpmulld xmm3, xmm3, xmmword ptr kHashMul0 - vpmovzxbd xmm2, [eax + 8] // src[8-11] - vpmulld xmm4, xmm4, xmmword ptr kHashMul1 - vpmovzxbd xmm1, [eax + 12] // src[12-15] - vpmulld xmm2, xmm2, xmmword ptr kHashMul2 - lea eax, [eax + 16] - vpmulld xmm1, xmm1, xmmword ptr kHashMul3 - vpaddd xmm3, xmm3, xmm4 // add 16 results - vpaddd xmm1, xmm1, xmm2 - vpaddd xmm1, xmm1, xmm3 - vpshufd xmm2, xmm1, 0x0e // upper 2 dwords - vpaddd xmm1, xmm1,xmm2 - vpshufd xmm2, xmm1, 0x01 - vpaddd xmm1, xmm1, xmm2 - vpaddd xmm0, xmm0, xmm1 - sub ecx, 16 - jg wloop - - vmovd eax, xmm0 // return hash - vzeroupper - ret - } -} -#endif // HAS_HASHDJB2_AVX2 - -#endif // !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) - -#ifdef __cplusplus -} // extern "C" -} // namespace libyuv -#endif diff --git a/thirdparty/libyuv/source/convert.cc b/thirdparty/libyuv/source/convert.cc deleted file mode 100644 index 69f7fb6..0000000 --- a/thirdparty/libyuv/source/convert.cc +++ /dev/null @@ -1,3148 +0,0 @@ -/* - * Copyright 2011 The LibYuv Project Authors. All rights reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. 
All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#include "libyuv/convert.h" - -#include "libyuv/basic_types.h" -#include "libyuv/cpu_id.h" -#include "libyuv/planar_functions.h" -#include "libyuv/rotate.h" -#include "libyuv/row.h" -#include "libyuv/scale.h" // For ScalePlane() -#include "libyuv/scale_uv.h" // For UVScale() - -#ifdef __cplusplus -namespace libyuv { -extern "C" { -#endif - -#define SUBSAMPLE(v, a, s) (v < 0) ? (-((-v + a) >> s)) : ((v + a) >> s) -static __inline int Abs(int v) { - return v >= 0 ? v : -v; -} - -// Any I4xx To I420 format with mirroring. -static int I4xxToI420(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_y, - int dst_stride_y, - uint8_t* dst_u, - int dst_stride_u, - uint8_t* dst_v, - int dst_stride_v, - int src_y_width, - int src_y_height, - int src_uv_width, - int src_uv_height) { - const int dst_y_width = Abs(src_y_width); - const int dst_y_height = Abs(src_y_height); - const int dst_uv_width = SUBSAMPLE(dst_y_width, 1, 1); - const int dst_uv_height = SUBSAMPLE(dst_y_height, 1, 1); - if (src_uv_width <= 0 || src_uv_height == 0) { - return -1; - } - if (dst_y) { - ScalePlane(src_y, src_stride_y, src_y_width, src_y_height, dst_y, - dst_stride_y, dst_y_width, dst_y_height, kFilterBilinear); - } - ScalePlane(src_u, src_stride_u, src_uv_width, src_uv_height, dst_u, - dst_stride_u, dst_uv_width, dst_uv_height, kFilterBilinear); - ScalePlane(src_v, src_stride_v, src_uv_width, src_uv_height, dst_v, - dst_stride_v, dst_uv_width, dst_uv_height, kFilterBilinear); - return 0; -} - -// Copy I420 with optional flipping. -// TODO(fbarchard): Use Scale plane which supports mirroring, but ensure -// is does row coalescing. -LIBYUV_API -int I420Copy(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_y, - int dst_stride_y, - uint8_t* dst_u, - int dst_stride_u, - uint8_t* dst_v, - int dst_stride_v, - int width, - int height) { - int halfwidth = (width + 1) >> 1; - int halfheight = (height + 1) >> 1; - if (!src_u || !src_v || !dst_u || !dst_v || width <= 0 || height == 0) { - return -1; - } - // Negative height means invert the image. - if (height < 0) { - height = -height; - halfheight = (height + 1) >> 1; - src_y = src_y + (height - 1) * src_stride_y; - src_u = src_u + (halfheight - 1) * src_stride_u; - src_v = src_v + (halfheight - 1) * src_stride_v; - src_stride_y = -src_stride_y; - src_stride_u = -src_stride_u; - src_stride_v = -src_stride_v; - } - - if (dst_y) { - CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height); - } - // Copy UV planes. - CopyPlane(src_u, src_stride_u, dst_u, dst_stride_u, halfwidth, halfheight); - CopyPlane(src_v, src_stride_v, dst_v, dst_stride_v, halfwidth, halfheight); - return 0; -} - -// Copy I010 with optional flipping. -LIBYUV_API -int I010Copy(const uint16_t* src_y, - int src_stride_y, - const uint16_t* src_u, - int src_stride_u, - const uint16_t* src_v, - int src_stride_v, - uint16_t* dst_y, - int dst_stride_y, - uint16_t* dst_u, - int dst_stride_u, - uint16_t* dst_v, - int dst_stride_v, - int width, - int height) { - int halfwidth = (width + 1) >> 1; - int halfheight = (height + 1) >> 1; - if (!src_u || !src_v || !dst_u || !dst_v || width <= 0 || height == 0) { - return -1; - } - // Negative height means invert the image. 
- if (height < 0) { - height = -height; - halfheight = (height + 1) >> 1; - src_y = src_y + (height - 1) * src_stride_y; - src_u = src_u + (halfheight - 1) * src_stride_u; - src_v = src_v + (halfheight - 1) * src_stride_v; - src_stride_y = -src_stride_y; - src_stride_u = -src_stride_u; - src_stride_v = -src_stride_v; - } - - if (dst_y) { - CopyPlane_16(src_y, src_stride_y, dst_y, dst_stride_y, width, height); - } - // Copy UV planes. - CopyPlane_16(src_u, src_stride_u, dst_u, dst_stride_u, halfwidth, halfheight); - CopyPlane_16(src_v, src_stride_v, dst_v, dst_stride_v, halfwidth, halfheight); - return 0; -} - -static int Planar16bitTo8bit(const uint16_t* src_y, - int src_stride_y, - const uint16_t* src_u, - int src_stride_u, - const uint16_t* src_v, - int src_stride_v, - uint8_t* dst_y, - int dst_stride_y, - uint8_t* dst_u, - int dst_stride_u, - uint8_t* dst_v, - int dst_stride_v, - int width, - int height, - int subsample_x, - int subsample_y, - int depth) { - int uv_width = SUBSAMPLE(width, subsample_x, subsample_x); - int uv_height = SUBSAMPLE(height, subsample_y, subsample_y); - int scale = 1 << (24 - depth); - if (!src_u || !src_v || !dst_u || !dst_v || width <= 0 || height == 0) { - return -1; - } - // Negative height means invert the image. - if (height < 0) { - height = -height; - uv_height = -uv_height; - src_y = src_y + (height - 1) * src_stride_y; - src_u = src_u + (uv_height - 1) * src_stride_u; - src_v = src_v + (uv_height - 1) * src_stride_v; - src_stride_y = -src_stride_y; - src_stride_u = -src_stride_u; - src_stride_v = -src_stride_v; - } - - // Convert Y plane. - Convert16To8Plane(src_y, src_stride_y, dst_y, dst_stride_y, scale, width, - height); - // Convert UV planes. - Convert16To8Plane(src_u, src_stride_u, dst_u, dst_stride_u, scale, uv_width, - uv_height); - Convert16To8Plane(src_v, src_stride_v, dst_v, dst_stride_v, scale, uv_width, - uv_height); - return 0; -} - -// Convert 10 bit YUV to 8 bit. 
-LIBYUV_API -int I010ToI420(const uint16_t* src_y, - int src_stride_y, - const uint16_t* src_u, - int src_stride_u, - const uint16_t* src_v, - int src_stride_v, - uint8_t* dst_y, - int dst_stride_y, - uint8_t* dst_u, - int dst_stride_u, - uint8_t* dst_v, - int dst_stride_v, - int width, - int height) { - return Planar16bitTo8bit(src_y, src_stride_y, src_u, src_stride_u, src_v, - src_stride_v, dst_y, dst_stride_y, dst_u, - dst_stride_u, dst_v, dst_stride_v, width, height, 1, - 1, 10); -} - -LIBYUV_API -int I210ToI422(const uint16_t* src_y, - int src_stride_y, - const uint16_t* src_u, - int src_stride_u, - const uint16_t* src_v, - int src_stride_v, - uint8_t* dst_y, - int dst_stride_y, - uint8_t* dst_u, - int dst_stride_u, - uint8_t* dst_v, - int dst_stride_v, - int width, - int height) { - return Planar16bitTo8bit(src_y, src_stride_y, src_u, src_stride_u, src_v, - src_stride_v, dst_y, dst_stride_y, dst_u, - dst_stride_u, dst_v, dst_stride_v, width, height, 1, - 0, 10); -} - -LIBYUV_API -int I410ToI444(const uint16_t* src_y, - int src_stride_y, - const uint16_t* src_u, - int src_stride_u, - const uint16_t* src_v, - int src_stride_v, - uint8_t* dst_y, - int dst_stride_y, - uint8_t* dst_u, - int dst_stride_u, - uint8_t* dst_v, - int dst_stride_v, - int width, - int height) { - return Planar16bitTo8bit(src_y, src_stride_y, src_u, src_stride_u, src_v, - src_stride_v, dst_y, dst_stride_y, dst_u, - dst_stride_u, dst_v, dst_stride_v, width, height, 0, - 0, 10); -} - -LIBYUV_API -int I012ToI420(const uint16_t* src_y, - int src_stride_y, - const uint16_t* src_u, - int src_stride_u, - const uint16_t* src_v, - int src_stride_v, - uint8_t* dst_y, - int dst_stride_y, - uint8_t* dst_u, - int dst_stride_u, - uint8_t* dst_v, - int dst_stride_v, - int width, - int height) { - return Planar16bitTo8bit(src_y, src_stride_y, src_u, src_stride_u, src_v, - src_stride_v, dst_y, dst_stride_y, dst_u, - dst_stride_u, dst_v, dst_stride_v, width, height, 1, - 1, 12); -} - -LIBYUV_API -int I212ToI422(const uint16_t* src_y, - int src_stride_y, - const uint16_t* src_u, - int src_stride_u, - const uint16_t* src_v, - int src_stride_v, - uint8_t* dst_y, - int dst_stride_y, - uint8_t* dst_u, - int dst_stride_u, - uint8_t* dst_v, - int dst_stride_v, - int width, - int height) { - return Planar16bitTo8bit(src_y, src_stride_y, src_u, src_stride_u, src_v, - src_stride_v, dst_y, dst_stride_y, dst_u, - dst_stride_u, dst_v, dst_stride_v, width, height, 1, - 0, 12); -} - -LIBYUV_API -int I412ToI444(const uint16_t* src_y, - int src_stride_y, - const uint16_t* src_u, - int src_stride_u, - const uint16_t* src_v, - int src_stride_v, - uint8_t* dst_y, - int dst_stride_y, - uint8_t* dst_u, - int dst_stride_u, - uint8_t* dst_v, - int dst_stride_v, - int width, - int height) { - return Planar16bitTo8bit(src_y, src_stride_y, src_u, src_stride_u, src_v, - src_stride_v, dst_y, dst_stride_y, dst_u, - dst_stride_u, dst_v, dst_stride_v, width, height, 0, - 0, 12); -} - -// Any Ix10 To I010 format with mirroring. 
-static int Ix10ToI010(const uint16_t* src_y, - int src_stride_y, - const uint16_t* src_u, - int src_stride_u, - const uint16_t* src_v, - int src_stride_v, - uint16_t* dst_y, - int dst_stride_y, - uint16_t* dst_u, - int dst_stride_u, - uint16_t* dst_v, - int dst_stride_v, - int width, - int height, - int subsample_x, - int subsample_y) { - const int dst_y_width = Abs(width); - const int dst_y_height = Abs(height); - const int src_uv_width = SUBSAMPLE(width, subsample_x, subsample_x); - const int src_uv_height = SUBSAMPLE(height, subsample_y, subsample_y); - const int dst_uv_width = SUBSAMPLE(dst_y_width, 1, 1); - const int dst_uv_height = SUBSAMPLE(dst_y_height, 1, 1); - if (width <= 0 || height == 0) { - return -1; - } - if (dst_y) { - ScalePlane_12(src_y, src_stride_y, width, height, dst_y, dst_stride_y, - dst_y_width, dst_y_height, kFilterBilinear); - } - ScalePlane_12(src_u, src_stride_u, src_uv_width, src_uv_height, dst_u, - dst_stride_u, dst_uv_width, dst_uv_height, kFilterBilinear); - ScalePlane_12(src_v, src_stride_v, src_uv_width, src_uv_height, dst_v, - dst_stride_v, dst_uv_width, dst_uv_height, kFilterBilinear); - return 0; -} - -LIBYUV_API -int I410ToI010(const uint16_t* src_y, - int src_stride_y, - const uint16_t* src_u, - int src_stride_u, - const uint16_t* src_v, - int src_stride_v, - uint16_t* dst_y, - int dst_stride_y, - uint16_t* dst_u, - int dst_stride_u, - uint16_t* dst_v, - int dst_stride_v, - int width, - int height) { - return Ix10ToI010(src_y, src_stride_y, src_u, src_stride_u, src_v, - src_stride_v, dst_y, dst_stride_y, dst_u, dst_stride_u, - dst_v, dst_stride_v, width, height, 0, 0); -} - -LIBYUV_API -int I210ToI010(const uint16_t* src_y, - int src_stride_y, - const uint16_t* src_u, - int src_stride_u, - const uint16_t* src_v, - int src_stride_v, - uint16_t* dst_y, - int dst_stride_y, - uint16_t* dst_u, - int dst_stride_u, - uint16_t* dst_v, - int dst_stride_v, - int width, - int height) { - return Ix10ToI010(src_y, src_stride_y, src_u, src_stride_u, src_v, - src_stride_v, dst_y, dst_stride_y, dst_u, dst_stride_u, - dst_v, dst_stride_v, width, height, 1, 0); -} - -// Any I[420]1[02] to P[420]1[02] format with mirroring. 
-static int IxxxToPxxx(const uint16_t* src_y, - int src_stride_y, - const uint16_t* src_u, - int src_stride_u, - const uint16_t* src_v, - int src_stride_v, - uint16_t* dst_y, - int dst_stride_y, - uint16_t* dst_uv, - int dst_stride_uv, - int width, - int height, - int subsample_x, - int subsample_y, - int depth) { - const int uv_width = SUBSAMPLE(width, subsample_x, subsample_x); - const int uv_height = SUBSAMPLE(height, subsample_y, subsample_y); - if (width <= 0 || height == 0) { - return -1; - } - - ConvertToMSBPlane_16(src_y, src_stride_y, dst_y, dst_stride_y, width, height, - depth); - MergeUVPlane_16(src_u, src_stride_u, src_v, src_stride_v, dst_uv, - dst_stride_uv, uv_width, uv_height, depth); - return 0; -} - -LIBYUV_API -int I010ToP010(const uint16_t* src_y, - int src_stride_y, - const uint16_t* src_u, - int src_stride_u, - const uint16_t* src_v, - int src_stride_v, - uint16_t* dst_y, - int dst_stride_y, - uint16_t* dst_uv, - int dst_stride_uv, - int width, - int height) { - return IxxxToPxxx(src_y, src_stride_y, src_u, src_stride_u, src_v, - src_stride_v, dst_y, dst_stride_y, dst_uv, dst_stride_uv, - width, height, 1, 1, 10); -} - -LIBYUV_API -int I210ToP210(const uint16_t* src_y, - int src_stride_y, - const uint16_t* src_u, - int src_stride_u, - const uint16_t* src_v, - int src_stride_v, - uint16_t* dst_y, - int dst_stride_y, - uint16_t* dst_uv, - int dst_stride_uv, - int width, - int height) { - return IxxxToPxxx(src_y, src_stride_y, src_u, src_stride_u, src_v, - src_stride_v, dst_y, dst_stride_y, dst_uv, dst_stride_uv, - width, height, 1, 0, 10); -} - -LIBYUV_API -int I012ToP012(const uint16_t* src_y, - int src_stride_y, - const uint16_t* src_u, - int src_stride_u, - const uint16_t* src_v, - int src_stride_v, - uint16_t* dst_y, - int dst_stride_y, - uint16_t* dst_uv, - int dst_stride_uv, - int width, - int height) { - return IxxxToPxxx(src_y, src_stride_y, src_u, src_stride_u, src_v, - src_stride_v, dst_y, dst_stride_y, dst_uv, dst_stride_uv, - width, height, 1, 1, 12); -} - -LIBYUV_API -int I212ToP212(const uint16_t* src_y, - int src_stride_y, - const uint16_t* src_u, - int src_stride_u, - const uint16_t* src_v, - int src_stride_v, - uint16_t* dst_y, - int dst_stride_y, - uint16_t* dst_uv, - int dst_stride_uv, - int width, - int height) { - return IxxxToPxxx(src_y, src_stride_y, src_u, src_stride_u, src_v, - src_stride_v, dst_y, dst_stride_y, dst_uv, dst_stride_uv, - width, height, 1, 0, 12); -} - -// 422 chroma is 1/2 width, 1x height -// 420 chroma is 1/2 width, 1/2 height -LIBYUV_API -int I422ToI420(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_y, - int dst_stride_y, - uint8_t* dst_u, - int dst_stride_u, - uint8_t* dst_v, - int dst_stride_v, - int width, - int height) { - const int src_uv_width = SUBSAMPLE(width, 1, 1); - return I4xxToI420(src_y, src_stride_y, src_u, src_stride_u, src_v, - src_stride_v, dst_y, dst_stride_y, dst_u, dst_stride_u, - dst_v, dst_stride_v, width, height, src_uv_width, height); -} - -// TODO(fbarchard): Implement row conversion. -LIBYUV_API -int I422ToNV21(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_y, - int dst_stride_y, - uint8_t* dst_vu, - int dst_stride_vu, - int width, - int height) { - int halfwidth = (width + 1) >> 1; - int halfheight = (height + 1) >> 1; - // Negative height means invert the image. 
- if (height < 0) { - height = -height; - halfheight = (height + 1) >> 1; - src_y = src_y + (height - 1) * src_stride_y; - src_u = src_u + (height - 1) * src_stride_u; - src_v = src_v + (height - 1) * src_stride_v; - src_stride_y = -src_stride_y; - src_stride_u = -src_stride_u; - src_stride_v = -src_stride_v; - } - - // Allocate u and v buffers - align_buffer_64(plane_u, halfwidth * halfheight * 2); - uint8_t* plane_v = plane_u + halfwidth * halfheight; - - I422ToI420(src_y, src_stride_y, src_u, src_stride_u, src_v, src_stride_v, - dst_y, dst_stride_y, plane_u, halfwidth, plane_v, halfwidth, width, - height); - MergeUVPlane(plane_v, halfwidth, plane_u, halfwidth, dst_vu, dst_stride_vu, - halfwidth, halfheight); - free_aligned_buffer_64(plane_u); - return 0; -} - -#ifdef I422TONV21_ROW_VERSION -// Unittest fails for this version. -// 422 chroma is 1/2 width, 1x height -// 420 chroma is 1/2 width, 1/2 height -// Swap src_u and src_v to implement I422ToNV12 -LIBYUV_API -int I422ToNV21(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_y, - int dst_stride_y, - uint8_t* dst_vu, - int dst_stride_vu, - int width, - int height) { - int y; - void (*MergeUVRow)(const uint8_t* src_u, const uint8_t* src_v, - uint8_t* dst_uv, int width) = MergeUVRow_C; - void (*InterpolateRow)(uint8_t * dst_ptr, const uint8_t* src_ptr, - ptrdiff_t src_stride, int dst_width, - int source_y_fraction) = InterpolateRow_C; - int halfwidth = (width + 1) >> 1; - int halfheight = (height + 1) >> 1; - if (!src_u || !src_v || !dst_vu || width <= 0 || height == 0) { - return -1; - } - // Negative height means invert the image. - if (height < 0) { - height = -height; - halfheight = (height + 1) >> 1; - src_y = src_y + (height - 1) * src_stride_y; - src_u = src_u + (halfheight - 1) * src_stride_u; - src_v = src_v + (halfheight - 1) * src_stride_v; - src_stride_y = -src_stride_y; - src_stride_u = -src_stride_u; - src_stride_v = -src_stride_v; - } -#if defined(HAS_MERGEUVROW_SSE2) - if (TestCpuFlag(kCpuHasSSE2)) { - MergeUVRow = MergeUVRow_Any_SSE2; - if (IS_ALIGNED(halfwidth, 16)) { - MergeUVRow = MergeUVRow_SSE2; - } - } -#endif -#if defined(HAS_MERGEUVROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - MergeUVRow = MergeUVRow_Any_AVX2; - if (IS_ALIGNED(halfwidth, 32)) { - MergeUVRow = MergeUVRow_AVX2; - } - } -#endif -#if defined(HAS_MERGEUVROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - MergeUVRow = MergeUVRow_Any_NEON; - if (IS_ALIGNED(halfwidth, 16)) { - MergeUVRow = MergeUVRow_NEON; - } - } -#endif -#if defined(HAS_MERGEUVROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - MergeUVRow = MergeUVRow_Any_MMI; - if (IS_ALIGNED(halfwidth, 8)) { - MergeUVRow = MergeUVRow_MMI; - } - } -#endif -#if defined(HAS_MERGEUVROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - MergeUVRow = MergeUVRow_Any_MSA; - if (IS_ALIGNED(halfwidth, 16)) { - MergeUVRow = MergeUVRow_MSA; - } - } -#endif -#if defined(HAS_INTERPOLATEROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - InterpolateRow = InterpolateRow_Any_SSSE3; - if (IS_ALIGNED(width, 16)) { - InterpolateRow = InterpolateRow_SSSE3; - } - } -#endif -#if defined(HAS_INTERPOLATEROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - InterpolateRow = InterpolateRow_Any_AVX2; - if (IS_ALIGNED(width, 32)) { - InterpolateRow = InterpolateRow_AVX2; - } - } -#endif -#if defined(HAS_INTERPOLATEROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - InterpolateRow = InterpolateRow_Any_NEON; - if (IS_ALIGNED(width, 16)) { - InterpolateRow = InterpolateRow_NEON; 
- } - } -#endif -#if defined(HAS_INTERPOLATEROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - InterpolateRow = InterpolateRow_Any_MMI; - if (IS_ALIGNED(width, 8)) { - InterpolateRow = InterpolateRow_MMI; - } - } -#endif -#if defined(HAS_INTERPOLATEROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - InterpolateRow = InterpolateRow_Any_MSA; - if (IS_ALIGNED(width, 32)) { - InterpolateRow = InterpolateRow_MSA; - } - } -#endif - - if (dst_y) { - CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, halfwidth, height); - } - { - // Allocate 2 rows of vu. - int awidth = halfwidth * 2; - align_buffer_64(row_vu_0, awidth * 2); - uint8_t* row_vu_1 = row_vu_0 + awidth; - - for (y = 0; y < height - 1; y += 2) { - MergeUVRow(src_v, src_u, row_vu_0, halfwidth); - MergeUVRow(src_v + src_stride_v, src_u + src_stride_u, row_vu_1, - halfwidth); - InterpolateRow(dst_vu, row_vu_0, awidth, awidth, 128); - src_u += src_stride_u * 2; - src_v += src_stride_v * 2; - dst_vu += dst_stride_vu; - } - if (height & 1) { - MergeUVRow(src_v, src_u, dst_vu, halfwidth); - } - free_aligned_buffer_64(row_vu_0); - } - return 0; -} -#endif // I422TONV21_ROW_VERSION - -// 444 chroma is 1x width, 1x height -// 420 chroma is 1/2 width, 1/2 height -LIBYUV_API -int I444ToI420(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_y, - int dst_stride_y, - uint8_t* dst_u, - int dst_stride_u, - uint8_t* dst_v, - int dst_stride_v, - int width, - int height) { - return I4xxToI420(src_y, src_stride_y, src_u, src_stride_u, src_v, - src_stride_v, dst_y, dst_stride_y, dst_u, dst_stride_u, - dst_v, dst_stride_v, width, height, width, height); -} - -LIBYUV_API -int I444ToNV12(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_y, - int dst_stride_y, - uint8_t* dst_uv, - int dst_stride_uv, - int width, - int height) { - if (!src_y || !src_u || !src_v || !dst_y || !dst_uv || width <= 0 || - height == 0) { - return -1; - } - // Negative height means invert the image. - if (height < 0) { - height = -height; - src_y = src_y + (height - 1) * src_stride_y; - src_u = src_u + (height - 1) * src_stride_u; - src_v = src_v + (height - 1) * src_stride_v; - src_stride_y = -src_stride_y; - src_stride_u = -src_stride_u; - src_stride_v = -src_stride_v; - } - if (dst_y) { - CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height); - } - HalfMergeUVPlane(src_u, src_stride_u, src_v, src_stride_v, dst_uv, - dst_stride_uv, width, height); - return 0; -} - -LIBYUV_API -int I444ToNV21(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_y, - int dst_stride_y, - uint8_t* dst_vu, - int dst_stride_vu, - int width, - int height) { - return I444ToNV12(src_y, src_stride_y, src_v, src_stride_v, src_u, - src_stride_u, dst_y, dst_stride_y, dst_vu, dst_stride_vu, - width, height); -} - -// I400 is greyscale typically used in MJPG -LIBYUV_API -int I400ToI420(const uint8_t* src_y, - int src_stride_y, - uint8_t* dst_y, - int dst_stride_y, - uint8_t* dst_u, - int dst_stride_u, - uint8_t* dst_v, - int dst_stride_v, - int width, - int height) { - int halfwidth = (width + 1) >> 1; - int halfheight = (height + 1) >> 1; - if (!dst_u || !dst_v || width <= 0 || height == 0) { - return -1; - } - // Negative height means invert the image. 
- if (height < 0) { - height = -height; - halfheight = (height + 1) >> 1; - src_y = src_y + (height - 1) * src_stride_y; - src_stride_y = -src_stride_y; - } - if (dst_y) { - CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height); - } - SetPlane(dst_u, dst_stride_u, halfwidth, halfheight, 128); - SetPlane(dst_v, dst_stride_v, halfwidth, halfheight, 128); - return 0; -} - -// I400 is greyscale typically used in MJPG -LIBYUV_API -int I400ToNV21(const uint8_t* src_y, - int src_stride_y, - uint8_t* dst_y, - int dst_stride_y, - uint8_t* dst_vu, - int dst_stride_vu, - int width, - int height) { - int halfwidth = (width + 1) >> 1; - int halfheight = (height + 1) >> 1; - if (!dst_vu || width <= 0 || height == 0) { - return -1; - } - // Negative height means invert the image. - if (height < 0) { - height = -height; - halfheight = (height + 1) >> 1; - src_y = src_y + (height - 1) * src_stride_y; - src_stride_y = -src_stride_y; - } - if (dst_y) { - CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height); - } - SetPlane(dst_vu, dst_stride_vu, halfwidth * 2, halfheight, 128); - return 0; -} - -// Convert NV12 to I420. -// TODO(fbarchard): Consider inverting destination. Faster on ARM with prfm. -LIBYUV_API -int NV12ToI420(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_uv, - int src_stride_uv, - uint8_t* dst_y, - int dst_stride_y, - uint8_t* dst_u, - int dst_stride_u, - uint8_t* dst_v, - int dst_stride_v, - int width, - int height) { - int halfwidth = (width + 1) >> 1; - int halfheight = (height + 1) >> 1; - if (!src_uv || !dst_u || !dst_v || width <= 0 || height == 0) { - return -1; - } - // Negative height means invert the image. - if (height < 0) { - height = -height; - halfheight = (height + 1) >> 1; - src_y = src_y + (height - 1) * src_stride_y; - src_uv = src_uv + (halfheight - 1) * src_stride_uv; - src_stride_y = -src_stride_y; - src_stride_uv = -src_stride_uv; - } - // Coalesce rows. - if (src_stride_y == width && dst_stride_y == width) { - width *= height; - height = 1; - src_stride_y = dst_stride_y = 0; - } - // Coalesce rows. - if (src_stride_uv == halfwidth * 2 && dst_stride_u == halfwidth && - dst_stride_v == halfwidth) { - halfwidth *= halfheight; - halfheight = 1; - src_stride_uv = dst_stride_u = dst_stride_v = 0; - } - - if (dst_y) { - CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height); - } - - // Split UV plane - NV12 / NV21 - SplitUVPlane(src_uv, src_stride_uv, dst_u, dst_stride_u, dst_v, dst_stride_v, - halfwidth, halfheight); - - return 0; -} - -// Convert NV21 to I420. Same as NV12 but u and v pointers swapped. 
-LIBYUV_API -int NV21ToI420(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_vu, - int src_stride_vu, - uint8_t* dst_y, - int dst_stride_y, - uint8_t* dst_u, - int dst_stride_u, - uint8_t* dst_v, - int dst_stride_v, - int width, - int height) { - return NV12ToI420(src_y, src_stride_y, src_vu, src_stride_vu, dst_y, - dst_stride_y, dst_v, dst_stride_v, dst_u, dst_stride_u, - width, height); -} - -LIBYUV_API -int NV12ToNV24(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_uv, - int src_stride_uv, - uint8_t* dst_y, - int dst_stride_y, - uint8_t* dst_uv, - int dst_stride_uv, - int width, - int height) { - if (width <= 0 || height == 0) { - return -1; - } - - if (dst_y) { - ScalePlane(src_y, src_stride_y, width, height, dst_y, dst_stride_y, - Abs(width), Abs(height), kFilterBilinear); - } - UVScale(src_uv, src_stride_uv, SUBSAMPLE(width, 1, 1), - SUBSAMPLE(height, 1, 1), dst_uv, dst_stride_uv, Abs(width), - Abs(height), kFilterBilinear); - return 0; -} - -LIBYUV_API -int NV16ToNV24(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_uv, - int src_stride_uv, - uint8_t* dst_y, - int dst_stride_y, - uint8_t* dst_uv, - int dst_stride_uv, - int width, - int height) { - if (width <= 0 || height == 0) { - return -1; - } - - if (dst_y) { - ScalePlane(src_y, src_stride_y, width, height, dst_y, dst_stride_y, - Abs(width), Abs(height), kFilterBilinear); - } - UVScale(src_uv, src_stride_uv, SUBSAMPLE(width, 1, 1), height, dst_uv, - dst_stride_uv, Abs(width), Abs(height), kFilterBilinear); - return 0; -} - -LIBYUV_API -int P010ToP410(const uint16_t* src_y, - int src_stride_y, - const uint16_t* src_uv, - int src_stride_uv, - uint16_t* dst_y, - int dst_stride_y, - uint16_t* dst_uv, - int dst_stride_uv, - int width, - int height) { - if (width <= 0 || height == 0) { - return -1; - } - - if (dst_y) { - ScalePlane_16(src_y, src_stride_y, width, height, dst_y, dst_stride_y, - Abs(width), Abs(height), kFilterBilinear); - } - UVScale_16(src_uv, src_stride_uv, SUBSAMPLE(width, 1, 1), - SUBSAMPLE(height, 1, 1), dst_uv, dst_stride_uv, Abs(width), - Abs(height), kFilterBilinear); - return 0; -} - -LIBYUV_API -int P210ToP410(const uint16_t* src_y, - int src_stride_y, - const uint16_t* src_uv, - int src_stride_uv, - uint16_t* dst_y, - int dst_stride_y, - uint16_t* dst_uv, - int dst_stride_uv, - int width, - int height) { - if (width <= 0 || height == 0) { - return -1; - } - - if (dst_y) { - ScalePlane_16(src_y, src_stride_y, width, height, dst_y, dst_stride_y, - Abs(width), Abs(height), kFilterBilinear); - } - UVScale_16(src_uv, src_stride_uv, SUBSAMPLE(width, 1, 1), height, dst_uv, - dst_stride_uv, Abs(width), Abs(height), kFilterBilinear); - return 0; -} - -// Convert YUY2 to I420. -LIBYUV_API -int YUY2ToI420(const uint8_t* src_yuy2, - int src_stride_yuy2, - uint8_t* dst_y, - int dst_stride_y, - uint8_t* dst_u, - int dst_stride_u, - uint8_t* dst_v, - int dst_stride_v, - int width, - int height) { - int y; - void (*YUY2ToUVRow)(const uint8_t* src_yuy2, int src_stride_yuy2, - uint8_t* dst_u, uint8_t* dst_v, int width) = - YUY2ToUVRow_C; - void (*YUY2ToYRow)(const uint8_t* src_yuy2, uint8_t* dst_y, int width) = - YUY2ToYRow_C; - // Negative height means invert the image. 
- if (height < 0) { - height = -height; - src_yuy2 = src_yuy2 + (height - 1) * src_stride_yuy2; - src_stride_yuy2 = -src_stride_yuy2; - } -#if defined(HAS_YUY2TOYROW_SSE2) - if (TestCpuFlag(kCpuHasSSE2)) { - YUY2ToUVRow = YUY2ToUVRow_Any_SSE2; - YUY2ToYRow = YUY2ToYRow_Any_SSE2; - if (IS_ALIGNED(width, 16)) { - YUY2ToUVRow = YUY2ToUVRow_SSE2; - YUY2ToYRow = YUY2ToYRow_SSE2; - } - } -#endif -#if defined(HAS_YUY2TOYROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - YUY2ToUVRow = YUY2ToUVRow_Any_AVX2; - YUY2ToYRow = YUY2ToYRow_Any_AVX2; - if (IS_ALIGNED(width, 32)) { - YUY2ToUVRow = YUY2ToUVRow_AVX2; - YUY2ToYRow = YUY2ToYRow_AVX2; - } - } -#endif -#if defined(HAS_YUY2TOYROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - YUY2ToYRow = YUY2ToYRow_Any_NEON; - YUY2ToUVRow = YUY2ToUVRow_Any_NEON; - if (IS_ALIGNED(width, 16)) { - YUY2ToYRow = YUY2ToYRow_NEON; - YUY2ToUVRow = YUY2ToUVRow_NEON; - } - } -#endif -#if defined(HAS_YUY2TOYROW_MMI) && defined(HAS_YUY2TOUVROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - YUY2ToYRow = YUY2ToYRow_Any_MMI; - YUY2ToUVRow = YUY2ToUVRow_Any_MMI; - if (IS_ALIGNED(width, 8)) { - YUY2ToYRow = YUY2ToYRow_MMI; - if (IS_ALIGNED(width, 16)) { - YUY2ToUVRow = YUY2ToUVRow_MMI; - } - } - } -#endif -#if defined(HAS_YUY2TOYROW_MSA) && defined(HAS_YUY2TOUVROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - YUY2ToYRow = YUY2ToYRow_Any_MSA; - YUY2ToUVRow = YUY2ToUVRow_Any_MSA; - if (IS_ALIGNED(width, 32)) { - YUY2ToYRow = YUY2ToYRow_MSA; - YUY2ToUVRow = YUY2ToUVRow_MSA; - } - } -#endif - - for (y = 0; y < height - 1; y += 2) { - YUY2ToUVRow(src_yuy2, src_stride_yuy2, dst_u, dst_v, width); - YUY2ToYRow(src_yuy2, dst_y, width); - YUY2ToYRow(src_yuy2 + src_stride_yuy2, dst_y + dst_stride_y, width); - src_yuy2 += src_stride_yuy2 * 2; - dst_y += dst_stride_y * 2; - dst_u += dst_stride_u; - dst_v += dst_stride_v; - } - if (height & 1) { - YUY2ToUVRow(src_yuy2, 0, dst_u, dst_v, width); - YUY2ToYRow(src_yuy2, dst_y, width); - } - return 0; -} - -// Convert UYVY to I420. -LIBYUV_API -int UYVYToI420(const uint8_t* src_uyvy, - int src_stride_uyvy, - uint8_t* dst_y, - int dst_stride_y, - uint8_t* dst_u, - int dst_stride_u, - uint8_t* dst_v, - int dst_stride_v, - int width, - int height) { - int y; - void (*UYVYToUVRow)(const uint8_t* src_uyvy, int src_stride_uyvy, - uint8_t* dst_u, uint8_t* dst_v, int width) = - UYVYToUVRow_C; - void (*UYVYToYRow)(const uint8_t* src_uyvy, uint8_t* dst_y, int width) = - UYVYToYRow_C; - // Negative height means invert the image. 
- if (height < 0) { - height = -height; - src_uyvy = src_uyvy + (height - 1) * src_stride_uyvy; - src_stride_uyvy = -src_stride_uyvy; - } -#if defined(HAS_UYVYTOYROW_SSE2) - if (TestCpuFlag(kCpuHasSSE2)) { - UYVYToUVRow = UYVYToUVRow_Any_SSE2; - UYVYToYRow = UYVYToYRow_Any_SSE2; - if (IS_ALIGNED(width, 16)) { - UYVYToUVRow = UYVYToUVRow_SSE2; - UYVYToYRow = UYVYToYRow_SSE2; - } - } -#endif -#if defined(HAS_UYVYTOYROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - UYVYToUVRow = UYVYToUVRow_Any_AVX2; - UYVYToYRow = UYVYToYRow_Any_AVX2; - if (IS_ALIGNED(width, 32)) { - UYVYToUVRow = UYVYToUVRow_AVX2; - UYVYToYRow = UYVYToYRow_AVX2; - } - } -#endif -#if defined(HAS_UYVYTOYROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - UYVYToYRow = UYVYToYRow_Any_NEON; - UYVYToUVRow = UYVYToUVRow_Any_NEON; - if (IS_ALIGNED(width, 16)) { - UYVYToYRow = UYVYToYRow_NEON; - UYVYToUVRow = UYVYToUVRow_NEON; - } - } -#endif -#if defined(HAS_UYVYTOYROW_MMI) && defined(HAS_UYVYTOUVROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - UYVYToYRow = UYVYToYRow_Any_MMI; - UYVYToUVRow = UYVYToUVRow_Any_MMI; - if (IS_ALIGNED(width, 16)) { - UYVYToYRow = UYVYToYRow_MMI; - UYVYToUVRow = UYVYToUVRow_MMI; - } - } -#endif -#if defined(HAS_UYVYTOYROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - UYVYToYRow = UYVYToYRow_Any_MSA; - UYVYToUVRow = UYVYToUVRow_Any_MSA; - if (IS_ALIGNED(width, 32)) { - UYVYToYRow = UYVYToYRow_MSA; - UYVYToUVRow = UYVYToUVRow_MSA; - } - } -#endif - - for (y = 0; y < height - 1; y += 2) { - UYVYToUVRow(src_uyvy, src_stride_uyvy, dst_u, dst_v, width); - UYVYToYRow(src_uyvy, dst_y, width); - UYVYToYRow(src_uyvy + src_stride_uyvy, dst_y + dst_stride_y, width); - src_uyvy += src_stride_uyvy * 2; - dst_y += dst_stride_y * 2; - dst_u += dst_stride_u; - dst_v += dst_stride_v; - } - if (height & 1) { - UYVYToUVRow(src_uyvy, 0, dst_u, dst_v, width); - UYVYToYRow(src_uyvy, dst_y, width); - } - return 0; -} - -// Convert AYUV to NV12. -LIBYUV_API -int AYUVToNV12(const uint8_t* src_ayuv, - int src_stride_ayuv, - uint8_t* dst_y, - int dst_stride_y, - uint8_t* dst_uv, - int dst_stride_uv, - int width, - int height) { - int y; - void (*AYUVToUVRow)(const uint8_t* src_ayuv, int src_stride_ayuv, - uint8_t* dst_uv, int width) = AYUVToUVRow_C; - void (*AYUVToYRow)(const uint8_t* src_ayuv, uint8_t* dst_y, int width) = - AYUVToYRow_C; - // Negative height means invert the image. 
- if (height < 0) { - height = -height; - src_ayuv = src_ayuv + (height - 1) * src_stride_ayuv; - src_stride_ayuv = -src_stride_ayuv; - } -// place holders for future intel code -#if defined(HAS_AYUVTOYROW_SSE2) - if (TestCpuFlag(kCpuHasSSE2)) { - AYUVToUVRow = AYUVToUVRow_Any_SSE2; - AYUVToYRow = AYUVToYRow_Any_SSE2; - if (IS_ALIGNED(width, 16)) { - AYUVToUVRow = AYUVToUVRow_SSE2; - AYUVToYRow = AYUVToYRow_SSE2; - } - } -#endif -#if defined(HAS_AYUVTOYROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - AYUVToUVRow = AYUVToUVRow_Any_AVX2; - AYUVToYRow = AYUVToYRow_Any_AVX2; - if (IS_ALIGNED(width, 32)) { - AYUVToUVRow = AYUVToUVRow_AVX2; - AYUVToYRow = AYUVToYRow_AVX2; - } - } -#endif - -#if defined(HAS_AYUVTOYROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - AYUVToYRow = AYUVToYRow_Any_NEON; - AYUVToUVRow = AYUVToUVRow_Any_NEON; - if (IS_ALIGNED(width, 16)) { - AYUVToYRow = AYUVToYRow_NEON; - AYUVToUVRow = AYUVToUVRow_NEON; - } - } -#endif - - for (y = 0; y < height - 1; y += 2) { - AYUVToUVRow(src_ayuv, src_stride_ayuv, dst_uv, width); - AYUVToYRow(src_ayuv, dst_y, width); - AYUVToYRow(src_ayuv + src_stride_ayuv, dst_y + dst_stride_y, width); - src_ayuv += src_stride_ayuv * 2; - dst_y += dst_stride_y * 2; - dst_uv += dst_stride_uv; - } - if (height & 1) { - AYUVToUVRow(src_ayuv, 0, dst_uv, width); - AYUVToYRow(src_ayuv, dst_y, width); - } - return 0; -} - -// Convert AYUV to NV21. -LIBYUV_API -int AYUVToNV21(const uint8_t* src_ayuv, - int src_stride_ayuv, - uint8_t* dst_y, - int dst_stride_y, - uint8_t* dst_vu, - int dst_stride_vu, - int width, - int height) { - int y; - void (*AYUVToVURow)(const uint8_t* src_ayuv, int src_stride_ayuv, - uint8_t* dst_vu, int width) = AYUVToVURow_C; - void (*AYUVToYRow)(const uint8_t* src_ayuv, uint8_t* dst_y, int width) = - AYUVToYRow_C; - // Negative height means invert the image. - if (height < 0) { - height = -height; - src_ayuv = src_ayuv + (height - 1) * src_stride_ayuv; - src_stride_ayuv = -src_stride_ayuv; - } -// place holders for future intel code -#if defined(HAS_AYUVTOYROW_SSE2) - if (TestCpuFlag(kCpuHasSSE2)) { - AYUVToVURow = AYUVToVURow_Any_SSE2; - AYUVToYRow = AYUVToYRow_Any_SSE2; - if (IS_ALIGNED(width, 16)) { - AYUVToVURow = AYUVToVURow_SSE2; - AYUVToYRow = AYUVToYRow_SSE2; - } - } -#endif -#if defined(HAS_AYUVTOYROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - AYUVToVURow = AYUVToVURow_Any_AVX2; - AYUVToYRow = AYUVToYRow_Any_AVX2; - if (IS_ALIGNED(width, 32)) { - AYUVToVURow = AYUVToVURow_AVX2; - AYUVToYRow = AYUVToYRow_AVX2; - } - } -#endif - -#if defined(HAS_AYUVTOYROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - AYUVToYRow = AYUVToYRow_Any_NEON; - AYUVToVURow = AYUVToVURow_Any_NEON; - if (IS_ALIGNED(width, 16)) { - AYUVToYRow = AYUVToYRow_NEON; - AYUVToVURow = AYUVToVURow_NEON; - } - } -#endif - - for (y = 0; y < height - 1; y += 2) { - AYUVToVURow(src_ayuv, src_stride_ayuv, dst_vu, width); - AYUVToYRow(src_ayuv, dst_y, width); - AYUVToYRow(src_ayuv + src_stride_ayuv, dst_y + dst_stride_y, width); - src_ayuv += src_stride_ayuv * 2; - dst_y += dst_stride_y * 2; - dst_vu += dst_stride_vu; - } - if (height & 1) { - AYUVToVURow(src_ayuv, 0, dst_vu, width); - AYUVToYRow(src_ayuv, dst_y, width); - } - return 0; -} - -// Convert ARGB to I420. 
-LIBYUV_API -int ARGBToI420(const uint8_t* src_argb, - int src_stride_argb, - uint8_t* dst_y, - int dst_stride_y, - uint8_t* dst_u, - int dst_stride_u, - uint8_t* dst_v, - int dst_stride_v, - int width, - int height) { - int y; - void (*ARGBToUVRow)(const uint8_t* src_argb0, int src_stride_argb, - uint8_t* dst_u, uint8_t* dst_v, int width) = - ARGBToUVRow_C; - void (*ARGBToYRow)(const uint8_t* src_argb, uint8_t* dst_y, int width) = - ARGBToYRow_C; - if (!src_argb || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) { - return -1; - } - // Negative height means invert the image. - if (height < 0) { - height = -height; - src_argb = src_argb + (height - 1) * src_stride_argb; - src_stride_argb = -src_stride_argb; - } -#if defined(HAS_ARGBTOYROW_NEON) && defined(HAS_ARGBTOUVROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - ARGBToYRow = ARGBToYRow_Any_NEON; - ARGBToUVRow = ARGBToUVRow_Any_NEON; - if (IS_ALIGNED(width, 8)) { - ARGBToYRow = ARGBToYRow_NEON; - if (IS_ALIGNED(width, 16)) { - ARGBToUVRow = ARGBToUVRow_NEON; - } - } - } -#endif -#if defined(HAS_ARGBTOYROW_SSSE3) && defined(HAS_ARGBTOUVROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - ARGBToUVRow = ARGBToUVRow_Any_SSSE3; - ARGBToYRow = ARGBToYRow_Any_SSSE3; - if (IS_ALIGNED(width, 16)) { - ARGBToUVRow = ARGBToUVRow_SSSE3; - ARGBToYRow = ARGBToYRow_SSSE3; - } - } -#endif -#if defined(HAS_ARGBTOYROW_AVX2) && defined(HAS_ARGBTOUVROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - ARGBToUVRow = ARGBToUVRow_Any_AVX2; - ARGBToYRow = ARGBToYRow_Any_AVX2; - if (IS_ALIGNED(width, 32)) { - ARGBToUVRow = ARGBToUVRow_AVX2; - ARGBToYRow = ARGBToYRow_AVX2; - } - } -#endif -#if defined(HAS_ARGBTOYROW_MMI) && defined(HAS_ARGBTOUVROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - ARGBToYRow = ARGBToYRow_Any_MMI; - ARGBToUVRow = ARGBToUVRow_Any_MMI; - if (IS_ALIGNED(width, 8)) { - ARGBToYRow = ARGBToYRow_MMI; - } - if (IS_ALIGNED(width, 16)) { - ARGBToUVRow = ARGBToUVRow_MMI; - } - } -#endif -#if defined(HAS_ARGBTOYROW_MSA) && defined(HAS_ARGBTOUVROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - ARGBToYRow = ARGBToYRow_Any_MSA; - ARGBToUVRow = ARGBToUVRow_Any_MSA; - if (IS_ALIGNED(width, 16)) { - ARGBToYRow = ARGBToYRow_MSA; - } - if (IS_ALIGNED(width, 32)) { - ARGBToUVRow = ARGBToUVRow_MSA; - } - } -#endif - - for (y = 0; y < height - 1; y += 2) { - ARGBToUVRow(src_argb, src_stride_argb, dst_u, dst_v, width); - ARGBToYRow(src_argb, dst_y, width); - ARGBToYRow(src_argb + src_stride_argb, dst_y + dst_stride_y, width); - src_argb += src_stride_argb * 2; - dst_y += dst_stride_y * 2; - dst_u += dst_stride_u; - dst_v += dst_stride_v; - } - if (height & 1) { - ARGBToUVRow(src_argb, 0, dst_u, dst_v, width); - ARGBToYRow(src_argb, dst_y, width); - } - return 0; -} - -// Convert BGRA to I420. -LIBYUV_API -int BGRAToI420(const uint8_t* src_bgra, - int src_stride_bgra, - uint8_t* dst_y, - int dst_stride_y, - uint8_t* dst_u, - int dst_stride_u, - uint8_t* dst_v, - int dst_stride_v, - int width, - int height) { - int y; - void (*BGRAToUVRow)(const uint8_t* src_bgra0, int src_stride_bgra, - uint8_t* dst_u, uint8_t* dst_v, int width) = - BGRAToUVRow_C; - void (*BGRAToYRow)(const uint8_t* src_bgra, uint8_t* dst_y, int width) = - BGRAToYRow_C; - if (!src_bgra || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) { - return -1; - } - // Negative height means invert the image. 
- if (height < 0) { - height = -height; - src_bgra = src_bgra + (height - 1) * src_stride_bgra; - src_stride_bgra = -src_stride_bgra; - } -#if defined(HAS_BGRATOYROW_SSSE3) && defined(HAS_BGRATOUVROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - BGRAToUVRow = BGRAToUVRow_Any_SSSE3; - BGRAToYRow = BGRAToYRow_Any_SSSE3; - if (IS_ALIGNED(width, 16)) { - BGRAToUVRow = BGRAToUVRow_SSSE3; - BGRAToYRow = BGRAToYRow_SSSE3; - } - } -#endif -#if defined(HAS_BGRATOYROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - BGRAToYRow = BGRAToYRow_Any_NEON; - if (IS_ALIGNED(width, 8)) { - BGRAToYRow = BGRAToYRow_NEON; - } - } -#endif -#if defined(HAS_BGRATOUVROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - BGRAToUVRow = BGRAToUVRow_Any_NEON; - if (IS_ALIGNED(width, 16)) { - BGRAToUVRow = BGRAToUVRow_NEON; - } - } -#endif -#if defined(HAS_BGRATOYROW_MMI) && defined(HAS_BGRATOUVROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - BGRAToYRow = BGRAToYRow_Any_MMI; - BGRAToUVRow = BGRAToUVRow_Any_MMI; - if (IS_ALIGNED(width, 8)) { - BGRAToYRow = BGRAToYRow_MMI; - } - if (IS_ALIGNED(width, 16)) { - BGRAToUVRow = BGRAToUVRow_MMI; - } - } -#endif -#if defined(HAS_BGRATOYROW_MSA) && defined(HAS_BGRATOUVROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - BGRAToYRow = BGRAToYRow_Any_MSA; - BGRAToUVRow = BGRAToUVRow_Any_MSA; - if (IS_ALIGNED(width, 16)) { - BGRAToYRow = BGRAToYRow_MSA; - BGRAToUVRow = BGRAToUVRow_MSA; - } - } -#endif - - for (y = 0; y < height - 1; y += 2) { - BGRAToUVRow(src_bgra, src_stride_bgra, dst_u, dst_v, width); - BGRAToYRow(src_bgra, dst_y, width); - BGRAToYRow(src_bgra + src_stride_bgra, dst_y + dst_stride_y, width); - src_bgra += src_stride_bgra * 2; - dst_y += dst_stride_y * 2; - dst_u += dst_stride_u; - dst_v += dst_stride_v; - } - if (height & 1) { - BGRAToUVRow(src_bgra, 0, dst_u, dst_v, width); - BGRAToYRow(src_bgra, dst_y, width); - } - return 0; -} - -// Convert ABGR to I420. -LIBYUV_API -int ABGRToI420(const uint8_t* src_abgr, - int src_stride_abgr, - uint8_t* dst_y, - int dst_stride_y, - uint8_t* dst_u, - int dst_stride_u, - uint8_t* dst_v, - int dst_stride_v, - int width, - int height) { - int y; - void (*ABGRToUVRow)(const uint8_t* src_abgr0, int src_stride_abgr, - uint8_t* dst_u, uint8_t* dst_v, int width) = - ABGRToUVRow_C; - void (*ABGRToYRow)(const uint8_t* src_abgr, uint8_t* dst_y, int width) = - ABGRToYRow_C; - if (!src_abgr || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) { - return -1; - } - // Negative height means invert the image. 
- if (height < 0) { - height = -height; - src_abgr = src_abgr + (height - 1) * src_stride_abgr; - src_stride_abgr = -src_stride_abgr; - } -#if defined(HAS_ABGRTOYROW_SSSE3) && defined(HAS_ABGRTOUVROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - ABGRToUVRow = ABGRToUVRow_Any_SSSE3; - ABGRToYRow = ABGRToYRow_Any_SSSE3; - if (IS_ALIGNED(width, 16)) { - ABGRToUVRow = ABGRToUVRow_SSSE3; - ABGRToYRow = ABGRToYRow_SSSE3; - } - } -#endif -#if defined(HAS_ABGRTOYROW_AVX2) && defined(HAS_ABGRTOUVROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - ABGRToUVRow = ABGRToUVRow_Any_AVX2; - ABGRToYRow = ABGRToYRow_Any_AVX2; - if (IS_ALIGNED(width, 32)) { - ABGRToUVRow = ABGRToUVRow_AVX2; - ABGRToYRow = ABGRToYRow_AVX2; - } - } -#endif -#if defined(HAS_ABGRTOYROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - ABGRToYRow = ABGRToYRow_Any_NEON; - if (IS_ALIGNED(width, 8)) { - ABGRToYRow = ABGRToYRow_NEON; - } - } -#endif -#if defined(HAS_ABGRTOUVROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - ABGRToUVRow = ABGRToUVRow_Any_NEON; - if (IS_ALIGNED(width, 16)) { - ABGRToUVRow = ABGRToUVRow_NEON; - } - } -#endif -#if defined(HAS_ABGRTOYROW_MMI) && defined(HAS_ABGRTOUVROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - ABGRToYRow = ABGRToYRow_Any_MMI; - ABGRToUVRow = ABGRToUVRow_Any_MMI; - if (IS_ALIGNED(width, 8)) { - ABGRToYRow = ABGRToYRow_MMI; - } - if (IS_ALIGNED(width, 16)) { - ABGRToUVRow = ABGRToUVRow_MMI; - } - } -#endif -#if defined(HAS_ABGRTOYROW_MSA) && defined(HAS_ABGRTOUVROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - ABGRToYRow = ABGRToYRow_Any_MSA; - ABGRToUVRow = ABGRToUVRow_Any_MSA; - if (IS_ALIGNED(width, 16)) { - ABGRToYRow = ABGRToYRow_MSA; - ABGRToUVRow = ABGRToUVRow_MSA; - } - } -#endif - - for (y = 0; y < height - 1; y += 2) { - ABGRToUVRow(src_abgr, src_stride_abgr, dst_u, dst_v, width); - ABGRToYRow(src_abgr, dst_y, width); - ABGRToYRow(src_abgr + src_stride_abgr, dst_y + dst_stride_y, width); - src_abgr += src_stride_abgr * 2; - dst_y += dst_stride_y * 2; - dst_u += dst_stride_u; - dst_v += dst_stride_v; - } - if (height & 1) { - ABGRToUVRow(src_abgr, 0, dst_u, dst_v, width); - ABGRToYRow(src_abgr, dst_y, width); - } - return 0; -} - -// Convert RGBA to I420. -LIBYUV_API -int RGBAToI420(const uint8_t* src_rgba, - int src_stride_rgba, - uint8_t* dst_y, - int dst_stride_y, - uint8_t* dst_u, - int dst_stride_u, - uint8_t* dst_v, - int dst_stride_v, - int width, - int height) { - int y; - void (*RGBAToUVRow)(const uint8_t* src_rgba0, int src_stride_rgba, - uint8_t* dst_u, uint8_t* dst_v, int width) = - RGBAToUVRow_C; - void (*RGBAToYRow)(const uint8_t* src_rgba, uint8_t* dst_y, int width) = - RGBAToYRow_C; - if (!src_rgba || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) { - return -1; - } - // Negative height means invert the image. 
- if (height < 0) { - height = -height; - src_rgba = src_rgba + (height - 1) * src_stride_rgba; - src_stride_rgba = -src_stride_rgba; - } -#if defined(HAS_RGBATOYROW_SSSE3) && defined(HAS_RGBATOUVROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - RGBAToUVRow = RGBAToUVRow_Any_SSSE3; - RGBAToYRow = RGBAToYRow_Any_SSSE3; - if (IS_ALIGNED(width, 16)) { - RGBAToUVRow = RGBAToUVRow_SSSE3; - RGBAToYRow = RGBAToYRow_SSSE3; - } - } -#endif -#if defined(HAS_RGBATOYROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - RGBAToYRow = RGBAToYRow_Any_NEON; - if (IS_ALIGNED(width, 8)) { - RGBAToYRow = RGBAToYRow_NEON; - } - } -#endif -#if defined(HAS_RGBATOUVROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - RGBAToUVRow = RGBAToUVRow_Any_NEON; - if (IS_ALIGNED(width, 16)) { - RGBAToUVRow = RGBAToUVRow_NEON; - } - } -#endif -#if defined(HAS_RGBATOYROW_MMI) && defined(HAS_RGBATOUVROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - RGBAToYRow = RGBAToYRow_Any_MMI; - RGBAToUVRow = RGBAToUVRow_Any_MMI; - if (IS_ALIGNED(width, 8)) { - RGBAToYRow = RGBAToYRow_MMI; - } - if (IS_ALIGNED(width, 16)) { - RGBAToUVRow = RGBAToUVRow_MMI; - } - } -#endif -#if defined(HAS_RGBATOYROW_MSA) && defined(HAS_RGBATOUVROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - RGBAToYRow = RGBAToYRow_Any_MSA; - RGBAToUVRow = RGBAToUVRow_Any_MSA; - if (IS_ALIGNED(width, 16)) { - RGBAToYRow = RGBAToYRow_MSA; - RGBAToUVRow = RGBAToUVRow_MSA; - } - } -#endif - - for (y = 0; y < height - 1; y += 2) { - RGBAToUVRow(src_rgba, src_stride_rgba, dst_u, dst_v, width); - RGBAToYRow(src_rgba, dst_y, width); - RGBAToYRow(src_rgba + src_stride_rgba, dst_y + dst_stride_y, width); - src_rgba += src_stride_rgba * 2; - dst_y += dst_stride_y * 2; - dst_u += dst_stride_u; - dst_v += dst_stride_v; - } - if (height & 1) { - RGBAToUVRow(src_rgba, 0, dst_u, dst_v, width); - RGBAToYRow(src_rgba, dst_y, width); - } - return 0; -} - -// Convert RGB24 to I420. -LIBYUV_API -int RGB24ToI420(const uint8_t* src_rgb24, - int src_stride_rgb24, - uint8_t* dst_y, - int dst_stride_y, - uint8_t* dst_u, - int dst_stride_u, - uint8_t* dst_v, - int dst_stride_v, - int width, - int height) { - int y; -#if (defined(HAS_RGB24TOYROW_NEON) || defined(HAS_RGB24TOYROW_MSA) || \ - defined(HAS_RGB24TOYROW_MMI)) - void (*RGB24ToUVRow)(const uint8_t* src_rgb24, int src_stride_rgb24, - uint8_t* dst_u, uint8_t* dst_v, int width) = - RGB24ToUVRow_C; - void (*RGB24ToYRow)(const uint8_t* src_rgb24, uint8_t* dst_y, int width) = - RGB24ToYRow_C; -#else - void (*RGB24ToARGBRow)(const uint8_t* src_rgb, uint8_t* dst_argb, int width) = - RGB24ToARGBRow_C; - void (*ARGBToUVRow)(const uint8_t* src_argb0, int src_stride_argb, - uint8_t* dst_u, uint8_t* dst_v, int width) = - ARGBToUVRow_C; - void (*ARGBToYRow)(const uint8_t* src_argb, uint8_t* dst_y, int width) = - ARGBToYRow_C; -#endif - if (!src_rgb24 || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) { - return -1; - } - // Negative height means invert the image. - if (height < 0) { - height = -height; - src_rgb24 = src_rgb24 + (height - 1) * src_stride_rgb24; - src_stride_rgb24 = -src_stride_rgb24; - } - -// Neon version does direct RGB24 to YUV. -#if defined(HAS_RGB24TOYROW_NEON) && defined(HAS_RGB24TOUVROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - RGB24ToUVRow = RGB24ToUVRow_Any_NEON; - RGB24ToYRow = RGB24ToYRow_Any_NEON; - if (IS_ALIGNED(width, 8)) { - RGB24ToYRow = RGB24ToYRow_NEON; - if (IS_ALIGNED(width, 16)) { - RGB24ToUVRow = RGB24ToUVRow_NEON; - } - } - } -// MMI and MSA version does direct RGB24 to YUV. 
-#elif (defined(HAS_RGB24TOYROW_MMI) || defined(HAS_RGB24TOYROW_MSA)) -#if defined(HAS_RGB24TOYROW_MMI) && defined(HAS_RGB24TOUVROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - RGB24ToUVRow = RGB24ToUVRow_Any_MMI; - RGB24ToYRow = RGB24ToYRow_Any_MMI; - if (IS_ALIGNED(width, 8)) { - RGB24ToYRow = RGB24ToYRow_MMI; - if (IS_ALIGNED(width, 16)) { - RGB24ToUVRow = RGB24ToUVRow_MMI; - } - } - } -#endif -#if defined(HAS_RGB24TOYROW_MSA) && defined(HAS_RGB24TOUVROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - RGB24ToUVRow = RGB24ToUVRow_Any_MSA; - RGB24ToYRow = RGB24ToYRow_Any_MSA; - if (IS_ALIGNED(width, 16)) { - RGB24ToYRow = RGB24ToYRow_MSA; - RGB24ToUVRow = RGB24ToUVRow_MSA; - } - } -#endif -// Other platforms do intermediate conversion from RGB24 to ARGB. -#else -#if defined(HAS_RGB24TOARGBROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - RGB24ToARGBRow = RGB24ToARGBRow_Any_NEON; - if (IS_ALIGNED(width, 8)) { - RGB24ToARGBRow = RGB24ToARGBRow_NEON; - } - } -#endif -#if defined(HAS_RGB24TOARGBROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - RGB24ToARGBRow = RGB24ToARGBRow_Any_SSSE3; - if (IS_ALIGNED(width, 16)) { - RGB24ToARGBRow = RGB24ToARGBRow_SSSE3; - } - } -#endif -#if defined(HAS_ARGBTOYROW_NEON) && defined(HAS_ARGBTOUVROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - ARGBToUVRow = ARGBToUVRow_Any_NEON; - ARGBToYRow = ARGBToYRow_Any_NEON; - if (IS_ALIGNED(width, 8)) { - ARGBToYRow = ARGBToYRow_NEON; - if (IS_ALIGNED(width, 16)) { - ARGBToUVRow = ARGBToUVRow_NEON; - } - } - } -#endif -#if defined(HAS_ARGBTOYROW_SSSE3) && defined(HAS_ARGBTOUVROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - ARGBToUVRow = ARGBToUVRow_Any_SSSE3; - ARGBToYRow = ARGBToYRow_Any_SSSE3; - if (IS_ALIGNED(width, 16)) { - ARGBToUVRow = ARGBToUVRow_SSSE3; - ARGBToYRow = ARGBToYRow_SSSE3; - } - } -#endif -#if defined(HAS_ARGBTOYROW_AVX2) && defined(HAS_ARGBTOUVROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - ARGBToUVRow = ARGBToUVRow_Any_AVX2; - ARGBToYRow = ARGBToYRow_Any_AVX2; - if (IS_ALIGNED(width, 32)) { - ARGBToUVRow = ARGBToUVRow_AVX2; - ARGBToYRow = ARGBToYRow_AVX2; - } - } -#endif -#endif - - { -#if !(defined(HAS_RGB24TOYROW_NEON) || defined(HAS_RGB24TOYROW_MSA) || \ - defined(HAS_RGB24TOYROW_MMI)) - // Allocate 2 rows of ARGB. 
- const int kRowSize = (width * 4 + 31) & ~31; - align_buffer_64(row, kRowSize * 2); -#endif - - for (y = 0; y < height - 1; y += 2) { -#if (defined(HAS_RGB24TOYROW_NEON) || defined(HAS_RGB24TOYROW_MSA) || \ - defined(HAS_RGB24TOYROW_MMI)) - RGB24ToUVRow(src_rgb24, src_stride_rgb24, dst_u, dst_v, width); - RGB24ToYRow(src_rgb24, dst_y, width); - RGB24ToYRow(src_rgb24 + src_stride_rgb24, dst_y + dst_stride_y, width); -#else - RGB24ToARGBRow(src_rgb24, row, width); - RGB24ToARGBRow(src_rgb24 + src_stride_rgb24, row + kRowSize, width); - ARGBToUVRow(row, kRowSize, dst_u, dst_v, width); - ARGBToYRow(row, dst_y, width); - ARGBToYRow(row + kRowSize, dst_y + dst_stride_y, width); -#endif - src_rgb24 += src_stride_rgb24 * 2; - dst_y += dst_stride_y * 2; - dst_u += dst_stride_u; - dst_v += dst_stride_v; - } - if (height & 1) { -#if (defined(HAS_RGB24TOYROW_NEON) || defined(HAS_RGB24TOYROW_MSA) || \ - defined(HAS_RGB24TOYROW_MMI)) - RGB24ToUVRow(src_rgb24, 0, dst_u, dst_v, width); - RGB24ToYRow(src_rgb24, dst_y, width); -#else - RGB24ToARGBRow(src_rgb24, row, width); - ARGBToUVRow(row, 0, dst_u, dst_v, width); - ARGBToYRow(row, dst_y, width); -#endif - } -#if !(defined(HAS_RGB24TOYROW_NEON) || defined(HAS_RGB24TOYROW_MSA) || \ - defined(HAS_RGB24TOYROW_MMI)) - free_aligned_buffer_64(row); -#endif - } - return 0; -} - -// TODO(fbarchard): Use Matrix version to implement I420 and J420. -// Convert RGB24 to J420. -LIBYUV_API -int RGB24ToJ420(const uint8_t* src_rgb24, - int src_stride_rgb24, - uint8_t* dst_y, - int dst_stride_y, - uint8_t* dst_u, - int dst_stride_u, - uint8_t* dst_v, - int dst_stride_v, - int width, - int height) { - int y; -#if (defined(HAS_RGB24TOYJROW_NEON) && defined(HAS_RGB24TOUVJROW_NEON)) || \ - defined(HAS_RGB24TOYJROW_MSA) || defined(HAS_RGB24TOYJROW_MMI) - void (*RGB24ToUVJRow)(const uint8_t* src_rgb24, int src_stride_rgb24, - uint8_t* dst_u, uint8_t* dst_v, int width) = - RGB24ToUVJRow_C; - void (*RGB24ToYJRow)(const uint8_t* src_rgb24, uint8_t* dst_y, int width) = - RGB24ToYJRow_C; -#else - void (*RGB24ToARGBRow)(const uint8_t* src_rgb, uint8_t* dst_argb, int width) = - RGB24ToARGBRow_C; - void (*ARGBToUVJRow)(const uint8_t* src_argb0, int src_stride_argb, - uint8_t* dst_u, uint8_t* dst_v, int width) = - ARGBToUVJRow_C; - void (*ARGBToYJRow)(const uint8_t* src_argb, uint8_t* dst_y, int width) = - ARGBToYJRow_C; -#endif - if (!src_rgb24 || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) { - return -1; - } - // Negative height means invert the image. - if (height < 0) { - height = -height; - src_rgb24 = src_rgb24 + (height - 1) * src_stride_rgb24; - src_stride_rgb24 = -src_stride_rgb24; - } - -// Neon version does direct RGB24 to YUV. -#if defined(HAS_RGB24TOYJROW_NEON) && defined(HAS_RGB24TOUVJROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - RGB24ToUVJRow = RGB24ToUVJRow_Any_NEON; - RGB24ToYJRow = RGB24ToYJRow_Any_NEON; - if (IS_ALIGNED(width, 8)) { - RGB24ToYJRow = RGB24ToYJRow_NEON; - if (IS_ALIGNED(width, 16)) { - RGB24ToUVJRow = RGB24ToUVJRow_NEON; - } - } - } -// MMI and MSA version does direct RGB24 to YUV. 
-#elif (defined(HAS_RGB24TOYJROW_MMI) || defined(HAS_RGB24TOYJROW_MSA)) -#if defined(HAS_RGB24TOYJROW_MMI) && defined(HAS_RGB24TOUVJROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - RGB24ToUVJRow = RGB24ToUVJRow_Any_MMI; - RGB24ToYJRow = RGB24ToYJRow_Any_MMI; - if (IS_ALIGNED(width, 8)) { - RGB24ToYJRow = RGB24ToYJRow_MMI; - if (IS_ALIGNED(width, 16)) { - RGB24ToUVJRow = RGB24ToUVJRow_MMI; - } - } - } -#endif -#if defined(HAS_RGB24TOYJROW_MSA) && defined(HAS_RGB24TOUVJROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - RGB24ToUVJRow = RGB24ToUVJRow_Any_MSA; - RGB24ToYJRow = RGB24ToYJRow_Any_MSA; - if (IS_ALIGNED(width, 16)) { - RGB24ToYJRow = RGB24ToYJRow_MSA; - RGB24ToUVJRow = RGB24ToUVJRow_MSA; - } - } -#endif -#else -#if defined(HAS_RGB24TOARGBROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - RGB24ToARGBRow = RGB24ToARGBRow_Any_NEON; - if (IS_ALIGNED(width, 8)) { - RGB24ToARGBRow = RGB24ToARGBRow_NEON; - } - } -#endif -#if defined(HAS_RGB24TOARGBROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - RGB24ToARGBRow = RGB24ToARGBRow_Any_SSSE3; - if (IS_ALIGNED(width, 16)) { - RGB24ToARGBRow = RGB24ToARGBRow_SSSE3; - } - } -#endif -#if defined(HAS_ARGBTOYJROW_NEON) && defined(HAS_ARGBTOUVJROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - ARGBToUVJRow = ARGBToUVJRow_Any_NEON; - ARGBToYJRow = ARGBToYJRow_Any_NEON; - if (IS_ALIGNED(width, 8)) { - ARGBToYJRow = ARGBToYJRow_NEON; - if (IS_ALIGNED(width, 16)) { - ARGBToUVJRow = ARGBToUVJRow_NEON; - } - } - } -#endif -#if defined(HAS_ARGBTOYJROW_SSSE3) && defined(HAS_ARGBTOUVJROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - ARGBToUVJRow = ARGBToUVJRow_Any_SSSE3; - ARGBToYJRow = ARGBToYJRow_Any_SSSE3; - if (IS_ALIGNED(width, 16)) { - ARGBToUVJRow = ARGBToUVJRow_SSSE3; - ARGBToYJRow = ARGBToYJRow_SSSE3; - } - } -#endif -#if defined(HAS_ARGBTOYJROW_AVX2) && defined(HAS_ARGBTOUVJROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - ARGBToUVJRow = ARGBToUVJRow_Any_AVX2; - ARGBToYJRow = ARGBToYJRow_Any_AVX2; - if (IS_ALIGNED(width, 32)) { - ARGBToUVJRow = ARGBToUVJRow_AVX2; - ARGBToYJRow = ARGBToYJRow_AVX2; - } - } -#endif -#endif - - { -#if !((defined(HAS_RGB24TOYJROW_NEON) && defined(HAS_RGB24TOUVJROW_NEON)) || \ - defined(HAS_RGB24TOYJROW_MSA) || defined(HAS_RGB24TOYJROW_MMI)) - // Allocate 2 rows of ARGB. 
- const int kRowSize = (width * 4 + 31) & ~31; - align_buffer_64(row, kRowSize * 2); -#endif - - for (y = 0; y < height - 1; y += 2) { -#if ((defined(HAS_RGB24TOYJROW_NEON) && defined(HAS_RGB24TOUVJROW_NEON)) || \ - defined(HAS_RGB24TOYJROW_MSA) || defined(HAS_RGB24TOYJROW_MMI)) - RGB24ToUVJRow(src_rgb24, src_stride_rgb24, dst_u, dst_v, width); - RGB24ToYJRow(src_rgb24, dst_y, width); - RGB24ToYJRow(src_rgb24 + src_stride_rgb24, dst_y + dst_stride_y, width); -#else - RGB24ToARGBRow(src_rgb24, row, width); - RGB24ToARGBRow(src_rgb24 + src_stride_rgb24, row + kRowSize, width); - ARGBToUVJRow(row, kRowSize, dst_u, dst_v, width); - ARGBToYJRow(row, dst_y, width); - ARGBToYJRow(row + kRowSize, dst_y + dst_stride_y, width); -#endif - src_rgb24 += src_stride_rgb24 * 2; - dst_y += dst_stride_y * 2; - dst_u += dst_stride_u; - dst_v += dst_stride_v; - } - if (height & 1) { -#if ((defined(HAS_RGB24TOYJROW_NEON) && defined(HAS_RGB24TOUVJROW_NEON)) || \ - defined(HAS_RGB24TOYJROW_MSA) || defined(HAS_RGB24TOYJROW_MMI)) - RGB24ToUVJRow(src_rgb24, 0, dst_u, dst_v, width); - RGB24ToYJRow(src_rgb24, dst_y, width); -#else - RGB24ToARGBRow(src_rgb24, row, width); - ARGBToUVJRow(row, 0, dst_u, dst_v, width); - ARGBToYJRow(row, dst_y, width); -#endif - } -#if !((defined(HAS_RGB24TOYJROW_NEON) && defined(HAS_RGB24TOUVJROW_NEON)) || \ - defined(HAS_RGB24TOYJROW_MSA) || defined(HAS_RGB24TOYJROW_MMI)) - free_aligned_buffer_64(row); -#endif - } - return 0; -} - -// Convert RAW to I420. -LIBYUV_API -int RAWToI420(const uint8_t* src_raw, - int src_stride_raw, - uint8_t* dst_y, - int dst_stride_y, - uint8_t* dst_u, - int dst_stride_u, - uint8_t* dst_v, - int dst_stride_v, - int width, - int height) { - int y; -#if (defined(HAS_RAWTOYROW_NEON) && defined(HAS_RAWTOUVROW_NEON)) || \ - defined(HAS_RAWTOYROW_MSA) || defined(HAS_RAWTOYROW_MMI) - void (*RAWToUVRow)(const uint8_t* src_raw, int src_stride_raw, uint8_t* dst_u, - uint8_t* dst_v, int width) = RAWToUVRow_C; - void (*RAWToYRow)(const uint8_t* src_raw, uint8_t* dst_y, int width) = - RAWToYRow_C; -#else - void (*RAWToARGBRow)(const uint8_t* src_rgb, uint8_t* dst_argb, int width) = - RAWToARGBRow_C; - void (*ARGBToUVRow)(const uint8_t* src_argb0, int src_stride_argb, - uint8_t* dst_u, uint8_t* dst_v, int width) = - ARGBToUVRow_C; - void (*ARGBToYRow)(const uint8_t* src_argb, uint8_t* dst_y, int width) = - ARGBToYRow_C; -#endif - if (!src_raw || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) { - return -1; - } - // Negative height means invert the image. - if (height < 0) { - height = -height; - src_raw = src_raw + (height - 1) * src_stride_raw; - src_stride_raw = -src_stride_raw; - } - -// Neon version does direct RAW to YUV. -#if defined(HAS_RAWTOYROW_NEON) && defined(HAS_RAWTOUVROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - RAWToUVRow = RAWToUVRow_Any_NEON; - RAWToYRow = RAWToYRow_Any_NEON; - if (IS_ALIGNED(width, 8)) { - RAWToYRow = RAWToYRow_NEON; - if (IS_ALIGNED(width, 16)) { - RAWToUVRow = RAWToUVRow_NEON; - } - } - } -// MMI and MSA version does direct RAW to YUV. 
-#elif (defined(HAS_RAWTOYROW_MMI) || defined(HAS_RAWTOYROW_MSA)) -#if defined(HAS_RAWTOYROW_MMI) && defined(HAS_RAWTOUVROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - RAWToUVRow = RAWToUVRow_Any_MMI; - RAWToYRow = RAWToYRow_Any_MMI; - if (IS_ALIGNED(width, 8)) { - RAWToYRow = RAWToYRow_MMI; - if (IS_ALIGNED(width, 16)) { - RAWToUVRow = RAWToUVRow_MMI; - } - } - } -#endif -#if defined(HAS_RAWTOYROW_MSA) && defined(HAS_RAWTOUVROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - RAWToUVRow = RAWToUVRow_Any_MSA; - RAWToYRow = RAWToYRow_Any_MSA; - if (IS_ALIGNED(width, 16)) { - RAWToYRow = RAWToYRow_MSA; - RAWToUVRow = RAWToUVRow_MSA; - } - } -#endif -// Other platforms do intermediate conversion from RAW to ARGB. -#else -#if defined(HAS_RAWTOARGBROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - RAWToARGBRow = RAWToARGBRow_Any_NEON; - if (IS_ALIGNED(width, 8)) { - RAWToARGBRow = RAWToARGBRow_NEON; - } - } -#endif -#if defined(HAS_ARGBTOYROW_NEON) && defined(HAS_ARGBTOUVROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - ARGBToUVRow = ARGBToUVRow_Any_NEON; - ARGBToYRow = ARGBToYRow_Any_NEON; - if (IS_ALIGNED(width, 8)) { - ARGBToYRow = ARGBToYRow_NEON; - if (IS_ALIGNED(width, 16)) { - ARGBToUVRow = ARGBToUVRow_NEON; - } - } - } -#endif -#if defined(HAS_RAWTOARGBROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - RAWToARGBRow = RAWToARGBRow_Any_SSSE3; - if (IS_ALIGNED(width, 16)) { - RAWToARGBRow = RAWToARGBRow_SSSE3; - } - } -#endif -#if defined(HAS_ARGBTOYROW_SSSE3) && defined(HAS_ARGBTOUVROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - ARGBToUVRow = ARGBToUVRow_Any_SSSE3; - ARGBToYRow = ARGBToYRow_Any_SSSE3; - if (IS_ALIGNED(width, 16)) { - ARGBToUVRow = ARGBToUVRow_SSSE3; - ARGBToYRow = ARGBToYRow_SSSE3; - } - } -#endif -#if defined(HAS_ARGBTOYROW_AVX2) && defined(HAS_ARGBTOUVROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - ARGBToUVRow = ARGBToUVRow_Any_AVX2; - ARGBToYRow = ARGBToYRow_Any_AVX2; - if (IS_ALIGNED(width, 32)) { - ARGBToUVRow = ARGBToUVRow_AVX2; - ARGBToYRow = ARGBToYRow_AVX2; - } - } -#endif -#endif - - { -#if !(defined(HAS_RAWTOYROW_NEON) || defined(HAS_RAWTOYROW_MSA) || \ - defined(HAS_RAWTOYROW_MMI)) - // Allocate 2 rows of ARGB. - const int kRowSize = (width * 4 + 31) & ~31; - align_buffer_64(row, kRowSize * 2); -#endif - - for (y = 0; y < height - 1; y += 2) { -#if (defined(HAS_RAWTOYROW_NEON) || defined(HAS_RAWTOYROW_MSA) || \ - defined(HAS_RAWTOYROW_MMI)) - RAWToUVRow(src_raw, src_stride_raw, dst_u, dst_v, width); - RAWToYRow(src_raw, dst_y, width); - RAWToYRow(src_raw + src_stride_raw, dst_y + dst_stride_y, width); -#else - RAWToARGBRow(src_raw, row, width); - RAWToARGBRow(src_raw + src_stride_raw, row + kRowSize, width); - ARGBToUVRow(row, kRowSize, dst_u, dst_v, width); - ARGBToYRow(row, dst_y, width); - ARGBToYRow(row + kRowSize, dst_y + dst_stride_y, width); -#endif - src_raw += src_stride_raw * 2; - dst_y += dst_stride_y * 2; - dst_u += dst_stride_u; - dst_v += dst_stride_v; - } - if (height & 1) { -#if (defined(HAS_RAWTOYROW_NEON) || defined(HAS_RAWTOYROW_MSA) || \ - defined(HAS_RAWTOYROW_MMI)) - RAWToUVRow(src_raw, 0, dst_u, dst_v, width); - RAWToYRow(src_raw, dst_y, width); -#else - RAWToARGBRow(src_raw, row, width); - ARGBToUVRow(row, 0, dst_u, dst_v, width); - ARGBToYRow(row, dst_y, width); -#endif - } -#if !(defined(HAS_RAWTOYROW_NEON) || defined(HAS_RAWTOYROW_MSA) || \ - defined(HAS_RAWTOYROW_MMI)) - free_aligned_buffer_64(row); -#endif - } - return 0; -} - -// TODO(fbarchard): Use Matrix version to implement I420 and J420. -// Convert RAW to J420. 
-LIBYUV_API -int RAWToJ420(const uint8_t* src_raw, - int src_stride_raw, - uint8_t* dst_y, - int dst_stride_y, - uint8_t* dst_u, - int dst_stride_u, - uint8_t* dst_v, - int dst_stride_v, - int width, - int height) { - int y; -#if (defined(HAS_RAWTOYJROW_NEON) && defined(HAS_RAWTOUVJROW_NEON)) || \ - defined(HAS_RAWTOYJROW_MSA) || defined(HAS_RAWTOYJROW_MMI) - void (*RAWToUVJRow)(const uint8_t* src_raw, int src_stride_raw, - uint8_t* dst_u, uint8_t* dst_v, int width) = - RAWToUVJRow_C; - void (*RAWToYJRow)(const uint8_t* src_raw, uint8_t* dst_y, int width) = - RAWToYJRow_C; -#else - void (*RAWToARGBRow)(const uint8_t* src_rgb, uint8_t* dst_argb, int width) = - RAWToARGBRow_C; - void (*ARGBToUVJRow)(const uint8_t* src_argb0, int src_stride_argb, - uint8_t* dst_u, uint8_t* dst_v, int width) = - ARGBToUVJRow_C; - void (*ARGBToYJRow)(const uint8_t* src_argb, uint8_t* dst_y, int width) = - ARGBToYJRow_C; -#endif - if (!src_raw || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) { - return -1; - } - // Negative height means invert the image. - if (height < 0) { - height = -height; - src_raw = src_raw + (height - 1) * src_stride_raw; - src_stride_raw = -src_stride_raw; - } - -// Neon version does direct RAW to YUV. -#if defined(HAS_RAWTOYJROW_NEON) && defined(HAS_RAWTOUVJROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - RAWToUVJRow = RAWToUVJRow_Any_NEON; - RAWToYJRow = RAWToYJRow_Any_NEON; - if (IS_ALIGNED(width, 8)) { - RAWToYJRow = RAWToYJRow_NEON; - if (IS_ALIGNED(width, 16)) { - RAWToUVJRow = RAWToUVJRow_NEON; - } - } - } -// MMI and MSA version does direct RAW to YUV. -#elif (defined(HAS_RAWTOYJROW_MMI) || defined(HAS_RAWTOYJROW_MSA)) -#if defined(HAS_RAWTOYJROW_MMI) && defined(HAS_RAWTOUVJROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - RAWToUVJRow = RAWToUVJRow_Any_MMI; - RAWToYJRow = RAWToYJRow_Any_MMI; - if (IS_ALIGNED(width, 8)) { - RAWToYJRow = RAWToYJRow_MMI; - if (IS_ALIGNED(width, 16)) { - RAWToUVJRow = RAWToUVJRow_MMI; - } - } - } -#endif -#if defined(HAS_RAWTOYJROW_MSA) && defined(HAS_RAWTOUVJROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - RAWToUVJRow = RAWToUVJRow_Any_MSA; - RAWToYJRow = RAWToYJRow_Any_MSA; - if (IS_ALIGNED(width, 16)) { - RAWToYJRow = RAWToYJRow_MSA; - RAWToUVJRow = RAWToUVJRow_MSA; - } - } -#endif -#else -#if defined(HAS_RAWTOARGBROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - RAWToARGBRow = RAWToARGBRow_Any_NEON; - if (IS_ALIGNED(width, 8)) { - RAWToARGBRow = RAWToARGBRow_NEON; - } - } -#endif -#if defined(HAS_ARGBTOYJROW_NEON) && defined(HAS_ARGBTOUVJROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - ARGBToUVJRow = ARGBToUVJRow_Any_NEON; - ARGBToYJRow = ARGBToYJRow_Any_NEON; - if (IS_ALIGNED(width, 8)) { - ARGBToYJRow = ARGBToYJRow_NEON; - if (IS_ALIGNED(width, 16)) { - ARGBToUVJRow = ARGBToUVJRow_NEON; - } - } - } -#endif -#if defined(HAS_RAWTOARGBROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - RAWToARGBRow = RAWToARGBRow_Any_SSSE3; - if (IS_ALIGNED(width, 16)) { - RAWToARGBRow = RAWToARGBRow_SSSE3; - } - } -#endif -#if defined(HAS_ARGBTOYJROW_SSSE3) && defined(HAS_ARGBTOUVJROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - ARGBToUVJRow = ARGBToUVJRow_Any_SSSE3; - ARGBToYJRow = ARGBToYJRow_Any_SSSE3; - if (IS_ALIGNED(width, 16)) { - ARGBToUVJRow = ARGBToUVJRow_SSSE3; - ARGBToYJRow = ARGBToYJRow_SSSE3; - } - } -#endif -#if defined(HAS_ARGBTOYJROW_AVX2) && defined(HAS_ARGBTOUVJROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - ARGBToUVJRow = ARGBToUVJRow_Any_AVX2; - ARGBToYJRow = ARGBToYJRow_Any_AVX2; - if (IS_ALIGNED(width, 32)) { - ARGBToUVJRow = ARGBToUVJRow_AVX2; - 
ARGBToYJRow = ARGBToYJRow_AVX2; - } - } -#endif -#endif - - { -#if !((defined(HAS_RAWTOYJROW_NEON) && defined(HAS_RAWTOUVJROW_NEON)) || \ - defined(HAS_RAWTOYJROW_MSA) || defined(HAS_RAWTOYJROW_MMI)) - // Allocate 2 rows of ARGB. - const int kRowSize = (width * 4 + 31) & ~31; - align_buffer_64(row, kRowSize * 2); -#endif - - for (y = 0; y < height - 1; y += 2) { -#if ((defined(HAS_RAWTOYJROW_NEON) && defined(HAS_RAWTOUVJROW_NEON)) || \ - defined(HAS_RAWTOYJROW_MSA) || defined(HAS_RAWTOYJROW_MMI)) - RAWToUVJRow(src_raw, src_stride_raw, dst_u, dst_v, width); - RAWToYJRow(src_raw, dst_y, width); - RAWToYJRow(src_raw + src_stride_raw, dst_y + dst_stride_y, width); -#else - RAWToARGBRow(src_raw, row, width); - RAWToARGBRow(src_raw + src_stride_raw, row + kRowSize, width); - ARGBToUVJRow(row, kRowSize, dst_u, dst_v, width); - ARGBToYJRow(row, dst_y, width); - ARGBToYJRow(row + kRowSize, dst_y + dst_stride_y, width); -#endif - src_raw += src_stride_raw * 2; - dst_y += dst_stride_y * 2; - dst_u += dst_stride_u; - dst_v += dst_stride_v; - } - if (height & 1) { -#if ((defined(HAS_RAWTOYJROW_NEON) && defined(HAS_RAWTOUVJROW_NEON)) || \ - defined(HAS_RAWTOYJROW_MSA) || defined(HAS_RAWTOYJROW_MMI)) - RAWToUVJRow(src_raw, 0, dst_u, dst_v, width); - RAWToYJRow(src_raw, dst_y, width); -#else - RAWToARGBRow(src_raw, row, width); - ARGBToUVJRow(row, 0, dst_u, dst_v, width); - ARGBToYJRow(row, dst_y, width); -#endif - } -#if !((defined(HAS_RAWTOYJROW_NEON) && defined(HAS_RAWTOUVJROW_NEON)) || \ - defined(HAS_RAWTOYJROW_MSA) || defined(HAS_RAWTOYJROW_MMI)) - free_aligned_buffer_64(row); -#endif - } - return 0; -} - -// Convert RGB565 to I420. -LIBYUV_API -int RGB565ToI420(const uint8_t* src_rgb565, - int src_stride_rgb565, - uint8_t* dst_y, - int dst_stride_y, - uint8_t* dst_u, - int dst_stride_u, - uint8_t* dst_v, - int dst_stride_v, - int width, - int height) { - int y; -#if (defined(HAS_RGB565TOYROW_NEON) || defined(HAS_RGB565TOYROW_MSA) || \ - defined(HAS_RGB565TOYROW_MMI)) - void (*RGB565ToUVRow)(const uint8_t* src_rgb565, int src_stride_rgb565, - uint8_t* dst_u, uint8_t* dst_v, int width) = - RGB565ToUVRow_C; - void (*RGB565ToYRow)(const uint8_t* src_rgb565, uint8_t* dst_y, int width) = - RGB565ToYRow_C; -#else - void (*RGB565ToARGBRow)(const uint8_t* src_rgb, uint8_t* dst_argb, - int width) = RGB565ToARGBRow_C; - void (*ARGBToUVRow)(const uint8_t* src_argb0, int src_stride_argb, - uint8_t* dst_u, uint8_t* dst_v, int width) = - ARGBToUVRow_C; - void (*ARGBToYRow)(const uint8_t* src_argb, uint8_t* dst_y, int width) = - ARGBToYRow_C; -#endif - if (!src_rgb565 || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) { - return -1; - } - // Negative height means invert the image. - if (height < 0) { - height = -height; - src_rgb565 = src_rgb565 + (height - 1) * src_stride_rgb565; - src_stride_rgb565 = -src_stride_rgb565; - } - -// Neon version does direct RGB565 to YUV. -#if defined(HAS_RGB565TOYROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - RGB565ToUVRow = RGB565ToUVRow_Any_NEON; - RGB565ToYRow = RGB565ToYRow_Any_NEON; - if (IS_ALIGNED(width, 8)) { - RGB565ToYRow = RGB565ToYRow_NEON; - if (IS_ALIGNED(width, 16)) { - RGB565ToUVRow = RGB565ToUVRow_NEON; - } - } - } -// MMI and MSA version does direct RGB565 to YUV. 
-#elif (defined(HAS_RGB565TOYROW_MMI) || defined(HAS_RGB565TOYROW_MSA)) -#if defined(HAS_RGB565TOYROW_MMI) && defined(HAS_RGB565TOUVROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - RGB565ToUVRow = RGB565ToUVRow_Any_MMI; - RGB565ToYRow = RGB565ToYRow_Any_MMI; - if (IS_ALIGNED(width, 8)) { - RGB565ToYRow = RGB565ToYRow_MMI; - if (IS_ALIGNED(width, 16)) { - RGB565ToUVRow = RGB565ToUVRow_MMI; - } - } - } -#endif -#if defined(HAS_RGB565TOYROW_MSA) && defined(HAS_RGB565TOUVROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - RGB565ToUVRow = RGB565ToUVRow_Any_MSA; - RGB565ToYRow = RGB565ToYRow_Any_MSA; - if (IS_ALIGNED(width, 16)) { - RGB565ToYRow = RGB565ToYRow_MSA; - RGB565ToUVRow = RGB565ToUVRow_MSA; - } - } -#endif -// Other platforms do intermediate conversion from RGB565 to ARGB. -#else -#if defined(HAS_RGB565TOARGBROW_SSE2) - if (TestCpuFlag(kCpuHasSSE2)) { - RGB565ToARGBRow = RGB565ToARGBRow_Any_SSE2; - if (IS_ALIGNED(width, 8)) { - RGB565ToARGBRow = RGB565ToARGBRow_SSE2; - } - } -#endif -#if defined(HAS_RGB565TOARGBROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - RGB565ToARGBRow = RGB565ToARGBRow_Any_AVX2; - if (IS_ALIGNED(width, 16)) { - RGB565ToARGBRow = RGB565ToARGBRow_AVX2; - } - } -#endif -#if defined(HAS_ARGBTOYROW_SSSE3) && defined(HAS_ARGBTOUVROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - ARGBToUVRow = ARGBToUVRow_Any_SSSE3; - ARGBToYRow = ARGBToYRow_Any_SSSE3; - if (IS_ALIGNED(width, 16)) { - ARGBToUVRow = ARGBToUVRow_SSSE3; - ARGBToYRow = ARGBToYRow_SSSE3; - } - } -#endif -#if defined(HAS_ARGBTOYROW_AVX2) && defined(HAS_ARGBTOUVROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - ARGBToUVRow = ARGBToUVRow_Any_AVX2; - ARGBToYRow = ARGBToYRow_Any_AVX2; - if (IS_ALIGNED(width, 32)) { - ARGBToUVRow = ARGBToUVRow_AVX2; - ARGBToYRow = ARGBToYRow_AVX2; - } - } -#endif -#endif - { -#if !(defined(HAS_RGB565TOYROW_NEON) || defined(HAS_RGB565TOYROW_MSA) || \ - defined(HAS_RGB565TOYROW_MMI)) - // Allocate 2 rows of ARGB. - const int kRowSize = (width * 4 + 31) & ~31; - align_buffer_64(row, kRowSize * 2); -#endif - for (y = 0; y < height - 1; y += 2) { -#if (defined(HAS_RGB565TOYROW_NEON) || defined(HAS_RGB565TOYROW_MSA) || \ - defined(HAS_RGB565TOYROW_MMI)) - RGB565ToUVRow(src_rgb565, src_stride_rgb565, dst_u, dst_v, width); - RGB565ToYRow(src_rgb565, dst_y, width); - RGB565ToYRow(src_rgb565 + src_stride_rgb565, dst_y + dst_stride_y, width); -#else - RGB565ToARGBRow(src_rgb565, row, width); - RGB565ToARGBRow(src_rgb565 + src_stride_rgb565, row + kRowSize, width); - ARGBToUVRow(row, kRowSize, dst_u, dst_v, width); - ARGBToYRow(row, dst_y, width); - ARGBToYRow(row + kRowSize, dst_y + dst_stride_y, width); -#endif - src_rgb565 += src_stride_rgb565 * 2; - dst_y += dst_stride_y * 2; - dst_u += dst_stride_u; - dst_v += dst_stride_v; - } - if (height & 1) { -#if (defined(HAS_RGB565TOYROW_NEON) || defined(HAS_RGB565TOYROW_MSA) || \ - defined(HAS_RGB565TOYROW_MMI)) - RGB565ToUVRow(src_rgb565, 0, dst_u, dst_v, width); - RGB565ToYRow(src_rgb565, dst_y, width); -#else - RGB565ToARGBRow(src_rgb565, row, width); - ARGBToUVRow(row, 0, dst_u, dst_v, width); - ARGBToYRow(row, dst_y, width); -#endif - } -#if !(defined(HAS_RGB565TOYROW_NEON) || defined(HAS_RGB565TOYROW_MSA) || \ - defined(HAS_RGB565TOYROW_MMI)) - free_aligned_buffer_64(row); -#endif - } - return 0; -} - -// Convert ARGB1555 to I420. 
-LIBYUV_API -int ARGB1555ToI420(const uint8_t* src_argb1555, - int src_stride_argb1555, - uint8_t* dst_y, - int dst_stride_y, - uint8_t* dst_u, - int dst_stride_u, - uint8_t* dst_v, - int dst_stride_v, - int width, - int height) { - int y; -#if (defined(HAS_ARGB1555TOYROW_NEON) || defined(HAS_ARGB1555TOYROW_MSA) || \ - defined(HAS_ARGB1555TOYROW_MMI)) - void (*ARGB1555ToUVRow)(const uint8_t* src_argb1555, int src_stride_argb1555, - uint8_t* dst_u, uint8_t* dst_v, int width) = - ARGB1555ToUVRow_C; - void (*ARGB1555ToYRow)(const uint8_t* src_argb1555, uint8_t* dst_y, - int width) = ARGB1555ToYRow_C; -#else - void (*ARGB1555ToARGBRow)(const uint8_t* src_rgb, uint8_t* dst_argb, - int width) = ARGB1555ToARGBRow_C; - void (*ARGBToUVRow)(const uint8_t* src_argb0, int src_stride_argb, - uint8_t* dst_u, uint8_t* dst_v, int width) = - ARGBToUVRow_C; - void (*ARGBToYRow)(const uint8_t* src_argb, uint8_t* dst_y, int width) = - ARGBToYRow_C; -#endif - if (!src_argb1555 || !dst_y || !dst_u || !dst_v || width <= 0 || - height == 0) { - return -1; - } - // Negative height means invert the image. - if (height < 0) { - height = -height; - src_argb1555 = src_argb1555 + (height - 1) * src_stride_argb1555; - src_stride_argb1555 = -src_stride_argb1555; - } - -// Neon version does direct ARGB1555 to YUV. -#if defined(HAS_ARGB1555TOYROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - ARGB1555ToUVRow = ARGB1555ToUVRow_Any_NEON; - ARGB1555ToYRow = ARGB1555ToYRow_Any_NEON; - if (IS_ALIGNED(width, 8)) { - ARGB1555ToYRow = ARGB1555ToYRow_NEON; - if (IS_ALIGNED(width, 16)) { - ARGB1555ToUVRow = ARGB1555ToUVRow_NEON; - } - } - } -// MMI and MSA version does direct ARGB1555 to YUV. -#elif (defined(HAS_ARGB1555TOYROW_MMI) || defined(HAS_ARGB1555TOYROW_MSA)) -#if defined(HAS_ARGB1555TOYROW_MMI) && defined(HAS_ARGB1555TOUVROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - ARGB1555ToUVRow = ARGB1555ToUVRow_Any_MMI; - ARGB1555ToYRow = ARGB1555ToYRow_Any_MMI; - if (IS_ALIGNED(width, 8)) { - ARGB1555ToYRow = ARGB1555ToYRow_MMI; - if (IS_ALIGNED(width, 16)) { - ARGB1555ToUVRow = ARGB1555ToUVRow_MMI; - } - } - } -#endif -#if defined(HAS_ARGB1555TOYROW_MSA) && defined(HAS_ARGB1555TOUVROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - ARGB1555ToUVRow = ARGB1555ToUVRow_Any_MSA; - ARGB1555ToYRow = ARGB1555ToYRow_Any_MSA; - if (IS_ALIGNED(width, 16)) { - ARGB1555ToYRow = ARGB1555ToYRow_MSA; - ARGB1555ToUVRow = ARGB1555ToUVRow_MSA; - } - } -#endif -// Other platforms do intermediate conversion from ARGB1555 to ARGB. 
-#else -#if defined(HAS_ARGB1555TOARGBROW_SSE2) - if (TestCpuFlag(kCpuHasSSE2)) { - ARGB1555ToARGBRow = ARGB1555ToARGBRow_Any_SSE2; - if (IS_ALIGNED(width, 8)) { - ARGB1555ToARGBRow = ARGB1555ToARGBRow_SSE2; - } - } -#endif -#if defined(HAS_ARGB1555TOARGBROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - ARGB1555ToARGBRow = ARGB1555ToARGBRow_Any_AVX2; - if (IS_ALIGNED(width, 16)) { - ARGB1555ToARGBRow = ARGB1555ToARGBRow_AVX2; - } - } -#endif -#if defined(HAS_ARGBTOYROW_SSSE3) && defined(HAS_ARGBTOUVROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - ARGBToUVRow = ARGBToUVRow_Any_SSSE3; - ARGBToYRow = ARGBToYRow_Any_SSSE3; - if (IS_ALIGNED(width, 16)) { - ARGBToUVRow = ARGBToUVRow_SSSE3; - ARGBToYRow = ARGBToYRow_SSSE3; - } - } -#endif -#if defined(HAS_ARGBTOYROW_AVX2) && defined(HAS_ARGBTOUVROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - ARGBToUVRow = ARGBToUVRow_Any_AVX2; - ARGBToYRow = ARGBToYRow_Any_AVX2; - if (IS_ALIGNED(width, 32)) { - ARGBToUVRow = ARGBToUVRow_AVX2; - ARGBToYRow = ARGBToYRow_AVX2; - } - } -#endif -#endif - { -#if !(defined(HAS_ARGB1555TOYROW_NEON) || defined(HAS_ARGB1555TOYROW_MSA) || \ - defined(HAS_ARGB1555TOYROW_MMI)) - // Allocate 2 rows of ARGB. - const int kRowSize = (width * 4 + 31) & ~31; - align_buffer_64(row, kRowSize * 2); -#endif - - for (y = 0; y < height - 1; y += 2) { -#if (defined(HAS_ARGB1555TOYROW_NEON) || defined(HAS_ARGB1555TOYROW_MSA) || \ - defined(HAS_ARGB1555TOYROW_MMI)) - ARGB1555ToUVRow(src_argb1555, src_stride_argb1555, dst_u, dst_v, width); - ARGB1555ToYRow(src_argb1555, dst_y, width); - ARGB1555ToYRow(src_argb1555 + src_stride_argb1555, dst_y + dst_stride_y, - width); -#else - ARGB1555ToARGBRow(src_argb1555, row, width); - ARGB1555ToARGBRow(src_argb1555 + src_stride_argb1555, row + kRowSize, - width); - ARGBToUVRow(row, kRowSize, dst_u, dst_v, width); - ARGBToYRow(row, dst_y, width); - ARGBToYRow(row + kRowSize, dst_y + dst_stride_y, width); -#endif - src_argb1555 += src_stride_argb1555 * 2; - dst_y += dst_stride_y * 2; - dst_u += dst_stride_u; - dst_v += dst_stride_v; - } - if (height & 1) { -#if (defined(HAS_ARGB1555TOYROW_NEON) || defined(HAS_ARGB1555TOYROW_MSA) || \ - defined(HAS_ARGB1555TOYROW_MMI)) - ARGB1555ToUVRow(src_argb1555, 0, dst_u, dst_v, width); - ARGB1555ToYRow(src_argb1555, dst_y, width); -#else - ARGB1555ToARGBRow(src_argb1555, row, width); - ARGBToUVRow(row, 0, dst_u, dst_v, width); - ARGBToYRow(row, dst_y, width); -#endif - } -#if !(defined(HAS_ARGB1555TOYROW_NEON) || defined(HAS_ARGB1555TOYROW_MSA) || \ - defined(HAS_ARGB1555TOYROW_MMI)) - free_aligned_buffer_64(row); -#endif - } - return 0; -} - -// Convert ARGB4444 to I420. 
-LIBYUV_API -int ARGB4444ToI420(const uint8_t* src_argb4444, - int src_stride_argb4444, - uint8_t* dst_y, - int dst_stride_y, - uint8_t* dst_u, - int dst_stride_u, - uint8_t* dst_v, - int dst_stride_v, - int width, - int height) { - int y; -#if (defined(HAS_ARGB4444TOYROW_NEON) || defined(HAS_ARGB4444TOYROW_MMI)) - void (*ARGB4444ToUVRow)(const uint8_t* src_argb4444, int src_stride_argb4444, - uint8_t* dst_u, uint8_t* dst_v, int width) = - ARGB4444ToUVRow_C; - void (*ARGB4444ToYRow)(const uint8_t* src_argb4444, uint8_t* dst_y, - int width) = ARGB4444ToYRow_C; -#else - void (*ARGB4444ToARGBRow)(const uint8_t* src_rgb, uint8_t* dst_argb, - int width) = ARGB4444ToARGBRow_C; - void (*ARGBToUVRow)(const uint8_t* src_argb0, int src_stride_argb, - uint8_t* dst_u, uint8_t* dst_v, int width) = - ARGBToUVRow_C; - void (*ARGBToYRow)(const uint8_t* src_argb, uint8_t* dst_y, int width) = - ARGBToYRow_C; -#endif - if (!src_argb4444 || !dst_y || !dst_u || !dst_v || width <= 0 || - height == 0) { - return -1; - } - // Negative height means invert the image. - if (height < 0) { - height = -height; - src_argb4444 = src_argb4444 + (height - 1) * src_stride_argb4444; - src_stride_argb4444 = -src_stride_argb4444; - } - -// Neon version does direct ARGB4444 to YUV. -#if defined(HAS_ARGB4444TOYROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - ARGB4444ToUVRow = ARGB4444ToUVRow_Any_NEON; - ARGB4444ToYRow = ARGB4444ToYRow_Any_NEON; - if (IS_ALIGNED(width, 8)) { - ARGB4444ToYRow = ARGB4444ToYRow_NEON; - if (IS_ALIGNED(width, 16)) { - ARGB4444ToUVRow = ARGB4444ToUVRow_NEON; - } - } - } -#elif defined(HAS_ARGB4444TOYROW_MMI) && defined(HAS_ARGB4444TOUVROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - ARGB4444ToUVRow = ARGB4444ToUVRow_Any_MMI; - ARGB4444ToYRow = ARGB4444ToYRow_Any_MMI; - if (IS_ALIGNED(width, 8)) { - ARGB4444ToYRow = ARGB4444ToYRow_MMI; - if (IS_ALIGNED(width, 16)) { - ARGB4444ToUVRow = ARGB4444ToUVRow_MMI; - } - } - } -// Other platforms do intermediate conversion from ARGB4444 to ARGB. 
-#else -#if defined(HAS_ARGB4444TOARGBROW_SSE2) - if (TestCpuFlag(kCpuHasSSE2)) { - ARGB4444ToARGBRow = ARGB4444ToARGBRow_Any_SSE2; - if (IS_ALIGNED(width, 8)) { - ARGB4444ToARGBRow = ARGB4444ToARGBRow_SSE2; - } - } -#endif -#if defined(HAS_ARGB4444TOARGBROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - ARGB4444ToARGBRow = ARGB4444ToARGBRow_Any_AVX2; - if (IS_ALIGNED(width, 16)) { - ARGB4444ToARGBRow = ARGB4444ToARGBRow_AVX2; - } - } -#endif -#if defined(HAS_ARGB4444TOARGBROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - ARGB4444ToARGBRow = ARGB4444ToARGBRow_Any_MSA; - if (IS_ALIGNED(width, 16)) { - ARGB4444ToARGBRow = ARGB4444ToARGBRow_MSA; - } - } -#endif -#if defined(HAS_ARGBTOYROW_SSSE3) && defined(HAS_ARGBTOUVROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - ARGBToUVRow = ARGBToUVRow_Any_SSSE3; - ARGBToYRow = ARGBToYRow_Any_SSSE3; - if (IS_ALIGNED(width, 16)) { - ARGBToUVRow = ARGBToUVRow_SSSE3; - ARGBToYRow = ARGBToYRow_SSSE3; - } - } -#endif -#if defined(HAS_ARGBTOYROW_AVX2) && defined(HAS_ARGBTOUVROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - ARGBToUVRow = ARGBToUVRow_Any_AVX2; - ARGBToYRow = ARGBToYRow_Any_AVX2; - if (IS_ALIGNED(width, 32)) { - ARGBToUVRow = ARGBToUVRow_AVX2; - ARGBToYRow = ARGBToYRow_AVX2; - } - } -#endif -#if defined(HAS_ARGBTOYROW_MMI) && defined(HAS_ARGBTOUVROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - ARGBToUVRow = ARGBToUVRow_Any_MMI; - ARGBToYRow = ARGBToYRow_Any_MMI; - if (IS_ALIGNED(width, 8)) { - ARGBToYRow = ARGBToYRow_MMI; - if (IS_ALIGNED(width, 16)) { - ARGBToUVRow = ARGBToUVRow_MMI; - } - } - } -#endif -#if defined(HAS_ARGBTOYROW_MSA) && defined(HAS_ARGBTOUVROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - ARGBToUVRow = ARGBToUVRow_Any_MSA; - ARGBToYRow = ARGBToYRow_Any_MSA; - if (IS_ALIGNED(width, 16)) { - ARGBToYRow = ARGBToYRow_MSA; - if (IS_ALIGNED(width, 32)) { - ARGBToUVRow = ARGBToUVRow_MSA; - } - } - } -#endif -#endif - - { -#if !(defined(HAS_ARGB4444TOYROW_NEON) || defined(HAS_ARGB4444TOYROW_MMI)) - // Allocate 2 rows of ARGB. - const int kRowSize = (width * 4 + 31) & ~31; - align_buffer_64(row, kRowSize * 2); -#endif - - for (y = 0; y < height - 1; y += 2) { -#if (defined(HAS_ARGB4444TOYROW_NEON) || defined(HAS_ARGB4444TOYROW_MMI)) - ARGB4444ToUVRow(src_argb4444, src_stride_argb4444, dst_u, dst_v, width); - ARGB4444ToYRow(src_argb4444, dst_y, width); - ARGB4444ToYRow(src_argb4444 + src_stride_argb4444, dst_y + dst_stride_y, - width); -#else - ARGB4444ToARGBRow(src_argb4444, row, width); - ARGB4444ToARGBRow(src_argb4444 + src_stride_argb4444, row + kRowSize, - width); - ARGBToUVRow(row, kRowSize, dst_u, dst_v, width); - ARGBToYRow(row, dst_y, width); - ARGBToYRow(row + kRowSize, dst_y + dst_stride_y, width); -#endif - src_argb4444 += src_stride_argb4444 * 2; - dst_y += dst_stride_y * 2; - dst_u += dst_stride_u; - dst_v += dst_stride_v; - } - if (height & 1) { -#if (defined(HAS_ARGB4444TOYROW_NEON) || defined(HAS_ARGB4444TOYROW_MMI)) - ARGB4444ToUVRow(src_argb4444, 0, dst_u, dst_v, width); - ARGB4444ToYRow(src_argb4444, dst_y, width); -#else - ARGB4444ToARGBRow(src_argb4444, row, width); - ARGBToUVRow(row, 0, dst_u, dst_v, width); - ARGBToYRow(row, dst_y, width); -#endif - } -#if !(defined(HAS_ARGB4444TOYROW_NEON) || defined(HAS_ARGB4444TOYROW_MMI)) - free_aligned_buffer_64(row); -#endif - } - return 0; -} - -// Convert RGB24 to J400. 
-LIBYUV_API -int RGB24ToJ400(const uint8_t* src_rgb24, - int src_stride_rgb24, - uint8_t* dst_yj, - int dst_stride_yj, - int width, - int height) { - int y; - void (*RGB24ToYJRow)(const uint8_t* src_rgb24, uint8_t* dst_yj, int width) = - RGB24ToYJRow_C; - if (!src_rgb24 || !dst_yj || width <= 0 || height == 0) { - return -1; - } - if (height < 0) { - height = -height; - src_rgb24 = src_rgb24 + (height - 1) * src_stride_rgb24; - src_stride_rgb24 = -src_stride_rgb24; - } - // Coalesce rows. - if (src_stride_rgb24 == width * 3 && dst_stride_yj == width) { - width *= height; - height = 1; - src_stride_rgb24 = dst_stride_yj = 0; - } -#if defined(HAS_RGB24TOYJROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - RGB24ToYJRow = RGB24ToYJRow_Any_SSSE3; - if (IS_ALIGNED(width, 16)) { - RGB24ToYJRow = RGB24ToYJRow_SSSE3; - } - } -#endif -#if defined(HAS_RGB24TOYJROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - RGB24ToYJRow = RGB24ToYJRow_Any_AVX2; - if (IS_ALIGNED(width, 32)) { - RGB24ToYJRow = RGB24ToYJRow_AVX2; - } - } -#endif -#if defined(HAS_RGB24TOYJROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - RGB24ToYJRow = RGB24ToYJRow_Any_NEON; - if (IS_ALIGNED(width, 8)) { - RGB24ToYJRow = RGB24ToYJRow_NEON; - } - } -#endif -#if defined(HAS_RGB24TOYJROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - RGB24ToYJRow = RGB24ToYJRow_Any_MMI; - if (IS_ALIGNED(width, 8)) { - RGB24ToYJRow = RGB24ToYJRow_MMI; - } - } -#endif -#if defined(HAS_RGB24TOYJROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - RGB24ToYJRow = RGB24ToYJRow_Any_MSA; - if (IS_ALIGNED(width, 16)) { - RGB24ToYJRow = RGB24ToYJRow_MSA; - } - } -#endif - - for (y = 0; y < height; ++y) { - RGB24ToYJRow(src_rgb24, dst_yj, width); - src_rgb24 += src_stride_rgb24; - dst_yj += dst_stride_yj; - } - return 0; -} - -// Convert RAW to J400. -LIBYUV_API -int RAWToJ400(const uint8_t* src_raw, - int src_stride_raw, - uint8_t* dst_yj, - int dst_stride_yj, - int width, - int height) { - int y; - void (*RAWToYJRow)(const uint8_t* src_raw, uint8_t* dst_yj, int width) = - RAWToYJRow_C; - if (!src_raw || !dst_yj || width <= 0 || height == 0) { - return -1; - } - if (height < 0) { - height = -height; - src_raw = src_raw + (height - 1) * src_stride_raw; - src_stride_raw = -src_stride_raw; - } - // Coalesce rows. 
- if (src_stride_raw == width * 3 && dst_stride_yj == width) { - width *= height; - height = 1; - src_stride_raw = dst_stride_yj = 0; - } -#if defined(HAS_RAWTOYJROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - RAWToYJRow = RAWToYJRow_Any_SSSE3; - if (IS_ALIGNED(width, 16)) { - RAWToYJRow = RAWToYJRow_SSSE3; - } - } -#endif -#if defined(HAS_RAWTOYJROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - RAWToYJRow = RAWToYJRow_Any_AVX2; - if (IS_ALIGNED(width, 32)) { - RAWToYJRow = RAWToYJRow_AVX2; - } - } -#endif -#if defined(HAS_RAWTOYJROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - RAWToYJRow = RAWToYJRow_Any_NEON; - if (IS_ALIGNED(width, 8)) { - RAWToYJRow = RAWToYJRow_NEON; - } - } -#endif -#if defined(HAS_RAWTOYJROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - RAWToYJRow = RAWToYJRow_Any_MMI; - if (IS_ALIGNED(width, 8)) { - RAWToYJRow = RAWToYJRow_MMI; - } - } -#endif -#if defined(HAS_RAWTOYJROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - RAWToYJRow = RAWToYJRow_Any_MSA; - if (IS_ALIGNED(width, 16)) { - RAWToYJRow = RAWToYJRow_MSA; - } - } -#endif - - for (y = 0; y < height; ++y) { - RAWToYJRow(src_raw, dst_yj, width); - src_raw += src_stride_raw; - dst_yj += dst_stride_yj; - } - return 0; -} - -static void SplitPixels(const uint8_t* src_u, - int src_pixel_stride_uv, - uint8_t* dst_u, - int width) { - int i; - for (i = 0; i < width; ++i) { - *dst_u = *src_u; - ++dst_u; - src_u += src_pixel_stride_uv; - } -} - -// Convert Android420 to I420. -LIBYUV_API -int Android420ToI420(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - int src_pixel_stride_uv, - uint8_t* dst_y, - int dst_stride_y, - uint8_t* dst_u, - int dst_stride_u, - uint8_t* dst_v, - int dst_stride_v, - int width, - int height) { - int y; - const ptrdiff_t vu_off = src_v - src_u; - int halfwidth = (width + 1) >> 1; - int halfheight = (height + 1) >> 1; - if (!src_u || !src_v || !dst_u || !dst_v || width <= 0 || height == 0) { - return -1; - } - // Negative height means invert the image. 
- if (height < 0) { - height = -height; - halfheight = (height + 1) >> 1; - src_y = src_y + (height - 1) * src_stride_y; - src_u = src_u + (halfheight - 1) * src_stride_u; - src_v = src_v + (halfheight - 1) * src_stride_v; - src_stride_y = -src_stride_y; - src_stride_u = -src_stride_u; - src_stride_v = -src_stride_v; - } - - if (dst_y) { - CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height); - } - - // Copy UV planes as is - I420 - if (src_pixel_stride_uv == 1) { - CopyPlane(src_u, src_stride_u, dst_u, dst_stride_u, halfwidth, halfheight); - CopyPlane(src_v, src_stride_v, dst_v, dst_stride_v, halfwidth, halfheight); - return 0; - // Split UV planes - NV21 - } - if (src_pixel_stride_uv == 2 && vu_off == -1 && - src_stride_u == src_stride_v) { - SplitUVPlane(src_v, src_stride_v, dst_v, dst_stride_v, dst_u, dst_stride_u, - halfwidth, halfheight); - return 0; - // Split UV planes - NV12 - } - if (src_pixel_stride_uv == 2 && vu_off == 1 && src_stride_u == src_stride_v) { - SplitUVPlane(src_u, src_stride_u, dst_u, dst_stride_u, dst_v, dst_stride_v, - halfwidth, halfheight); - return 0; - } - - for (y = 0; y < halfheight; ++y) { - SplitPixels(src_u, src_pixel_stride_uv, dst_u, halfwidth); - SplitPixels(src_v, src_pixel_stride_uv, dst_v, halfwidth); - src_u += src_stride_u; - src_v += src_stride_v; - dst_u += dst_stride_u; - dst_v += dst_stride_v; - } - return 0; -} - -#ifdef __cplusplus -} // extern "C" -} // namespace libyuv -#endif diff --git a/thirdparty/libyuv/source/convert_argb.cc b/thirdparty/libyuv/source/convert_argb.cc deleted file mode 100644 index d8f7b27..0000000 --- a/thirdparty/libyuv/source/convert_argb.cc +++ /dev/null @@ -1,5350 +0,0 @@ -/* - * Copyright 2011 The LibYuv Project Authors. All rights reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#include "libyuv/convert_argb.h" - -#include "libyuv/cpu_id.h" -#ifdef HAVE_JPEG -#include "libyuv/mjpeg_decoder.h" -#endif -#include "libyuv/planar_functions.h" // For CopyPlane and ARGBShuffle. -#include "libyuv/rotate_argb.h" -#include "libyuv/row.h" -#include "libyuv/video_common.h" - -#ifdef __cplusplus -namespace libyuv { -extern "C" { -#endif - -// Copy ARGB with optional flipping -LIBYUV_API -int ARGBCopy(const uint8_t* src_argb, - int src_stride_argb, - uint8_t* dst_argb, - int dst_stride_argb, - int width, - int height) { - if (!src_argb || !dst_argb || width <= 0 || height == 0) { - return -1; - } - // Negative height means invert the image. - if (height < 0) { - height = -height; - src_argb = src_argb + (height - 1) * src_stride_argb; - src_stride_argb = -src_stride_argb; - } - - CopyPlane(src_argb, src_stride_argb, dst_argb, dst_stride_argb, width * 4, - height); - return 0; -} - -// Convert I420 to ARGB with matrix. 
-LIBYUV_API -int I420ToARGBMatrix(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_argb, - int dst_stride_argb, - const struct YuvConstants* yuvconstants, - int width, - int height) { - int y; - void (*I422ToARGBRow)(const uint8_t* y_buf, const uint8_t* u_buf, - const uint8_t* v_buf, uint8_t* rgb_buf, - const struct YuvConstants* yuvconstants, int width) = - I422ToARGBRow_C; - if (!src_y || !src_u || !src_v || !dst_argb || width <= 0 || height == 0) { - return -1; - } - // Negative height means invert the image. - if (height < 0) { - height = -height; - dst_argb = dst_argb + (height - 1) * dst_stride_argb; - dst_stride_argb = -dst_stride_argb; - } -#if defined(HAS_I422TOARGBROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - I422ToARGBRow = I422ToARGBRow_Any_SSSE3; - if (IS_ALIGNED(width, 8)) { - I422ToARGBRow = I422ToARGBRow_SSSE3; - } - } -#endif -#if defined(HAS_I422TOARGBROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - I422ToARGBRow = I422ToARGBRow_Any_AVX2; - if (IS_ALIGNED(width, 16)) { - I422ToARGBRow = I422ToARGBRow_AVX2; - } - } -#endif -#if defined(HAS_I422TOARGBROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - I422ToARGBRow = I422ToARGBRow_Any_NEON; - if (IS_ALIGNED(width, 8)) { - I422ToARGBRow = I422ToARGBRow_NEON; - } - } -#endif -#if defined(HAS_I422TOARGBROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - I422ToARGBRow = I422ToARGBRow_Any_MMI; - if (IS_ALIGNED(width, 4)) { - I422ToARGBRow = I422ToARGBRow_MMI; - } - } -#endif -#if defined(HAS_I422TOARGBROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - I422ToARGBRow = I422ToARGBRow_Any_MSA; - if (IS_ALIGNED(width, 8)) { - I422ToARGBRow = I422ToARGBRow_MSA; - } - } -#endif - - for (y = 0; y < height; ++y) { - I422ToARGBRow(src_y, src_u, src_v, dst_argb, yuvconstants, width); - dst_argb += dst_stride_argb; - src_y += src_stride_y; - if (y & 1) { - src_u += src_stride_u; - src_v += src_stride_v; - } - } - return 0; -} - -// Convert I420 to ARGB. -LIBYUV_API -int I420ToARGB(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_argb, - int dst_stride_argb, - int width, - int height) { - return I420ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v, - src_stride_v, dst_argb, dst_stride_argb, - &kYuvI601Constants, width, height); -} - -// Convert I420 to ABGR. -LIBYUV_API -int I420ToABGR(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_abgr, - int dst_stride_abgr, - int width, - int height) { - return I420ToARGBMatrix(src_y, src_stride_y, src_v, - src_stride_v, // Swap U and V - src_u, src_stride_u, dst_abgr, dst_stride_abgr, - &kYvuI601Constants, // Use Yvu matrix - width, height); -} - -// Convert J420 to ARGB. -LIBYUV_API -int J420ToARGB(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_argb, - int dst_stride_argb, - int width, - int height) { - return I420ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v, - src_stride_v, dst_argb, dst_stride_argb, - &kYuvJPEGConstants, width, height); -} - -// Convert J420 to ABGR. 
-LIBYUV_API -int J420ToABGR(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_abgr, - int dst_stride_abgr, - int width, - int height) { - return I420ToARGBMatrix(src_y, src_stride_y, src_v, - src_stride_v, // Swap U and V - src_u, src_stride_u, dst_abgr, dst_stride_abgr, - &kYvuJPEGConstants, // Use Yvu matrix - width, height); -} - -// Convert H420 to ARGB. -LIBYUV_API -int H420ToARGB(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_argb, - int dst_stride_argb, - int width, - int height) { - return I420ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v, - src_stride_v, dst_argb, dst_stride_argb, - &kYuvH709Constants, width, height); -} - -// Convert H420 to ABGR. -LIBYUV_API -int H420ToABGR(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_abgr, - int dst_stride_abgr, - int width, - int height) { - return I420ToARGBMatrix(src_y, src_stride_y, src_v, - src_stride_v, // Swap U and V - src_u, src_stride_u, dst_abgr, dst_stride_abgr, - &kYvuH709Constants, // Use Yvu matrix - width, height); -} - -// Convert U420 to ARGB. -LIBYUV_API -int U420ToARGB(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_argb, - int dst_stride_argb, - int width, - int height) { - return I420ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v, - src_stride_v, dst_argb, dst_stride_argb, - &kYuv2020Constants, width, height); -} - -// Convert U420 to ABGR. -LIBYUV_API -int U420ToABGR(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_abgr, - int dst_stride_abgr, - int width, - int height) { - return I420ToARGBMatrix(src_y, src_stride_y, src_v, - src_stride_v, // Swap U and V - src_u, src_stride_u, dst_abgr, dst_stride_abgr, - &kYvu2020Constants, // Use Yvu matrix - width, height); -} - -// Convert I422 to ARGB with matrix. -LIBYUV_API -int I422ToARGBMatrix(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_argb, - int dst_stride_argb, - const struct YuvConstants* yuvconstants, - int width, - int height) { - int y; - void (*I422ToARGBRow)(const uint8_t* y_buf, const uint8_t* u_buf, - const uint8_t* v_buf, uint8_t* rgb_buf, - const struct YuvConstants* yuvconstants, int width) = - I422ToARGBRow_C; - if (!src_y || !src_u || !src_v || !dst_argb || width <= 0 || height == 0) { - return -1; - } - // Negative height means invert the image. - if (height < 0) { - height = -height; - dst_argb = dst_argb + (height - 1) * dst_stride_argb; - dst_stride_argb = -dst_stride_argb; - } - // Coalesce rows. 
- if (src_stride_y == width && src_stride_u * 2 == width && - src_stride_v * 2 == width && dst_stride_argb == width * 4) { - width *= height; - height = 1; - src_stride_y = src_stride_u = src_stride_v = dst_stride_argb = 0; - } -#if defined(HAS_I422TOARGBROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - I422ToARGBRow = I422ToARGBRow_Any_SSSE3; - if (IS_ALIGNED(width, 8)) { - I422ToARGBRow = I422ToARGBRow_SSSE3; - } - } -#endif -#if defined(HAS_I422TOARGBROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - I422ToARGBRow = I422ToARGBRow_Any_AVX2; - if (IS_ALIGNED(width, 16)) { - I422ToARGBRow = I422ToARGBRow_AVX2; - } - } -#endif -#if defined(HAS_I422TOARGBROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - I422ToARGBRow = I422ToARGBRow_Any_NEON; - if (IS_ALIGNED(width, 8)) { - I422ToARGBRow = I422ToARGBRow_NEON; - } - } -#endif -#if defined(HAS_I422TOARGBROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - I422ToARGBRow = I422ToARGBRow_Any_MMI; - if (IS_ALIGNED(width, 4)) { - I422ToARGBRow = I422ToARGBRow_MMI; - } - } -#endif -#if defined(HAS_I422TOARGBROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - I422ToARGBRow = I422ToARGBRow_Any_MSA; - if (IS_ALIGNED(width, 8)) { - I422ToARGBRow = I422ToARGBRow_MSA; - } - } -#endif - - for (y = 0; y < height; ++y) { - I422ToARGBRow(src_y, src_u, src_v, dst_argb, yuvconstants, width); - dst_argb += dst_stride_argb; - src_y += src_stride_y; - src_u += src_stride_u; - src_v += src_stride_v; - } - return 0; -} - -// Convert I422 to ARGB. -LIBYUV_API -int I422ToARGB(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_argb, - int dst_stride_argb, - int width, - int height) { - return I422ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v, - src_stride_v, dst_argb, dst_stride_argb, - &kYuvI601Constants, width, height); -} - -// Convert I422 to ABGR. -LIBYUV_API -int I422ToABGR(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_abgr, - int dst_stride_abgr, - int width, - int height) { - return I422ToARGBMatrix(src_y, src_stride_y, src_v, - src_stride_v, // Swap U and V - src_u, src_stride_u, dst_abgr, dst_stride_abgr, - &kYvuI601Constants, // Use Yvu matrix - width, height); -} - -// Convert J422 to ARGB. -LIBYUV_API -int J422ToARGB(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_argb, - int dst_stride_argb, - int width, - int height) { - return I422ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v, - src_stride_v, dst_argb, dst_stride_argb, - &kYuvJPEGConstants, width, height); -} - -// Convert J422 to ABGR. -LIBYUV_API -int J422ToABGR(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_abgr, - int dst_stride_abgr, - int width, - int height) { - return I422ToARGBMatrix(src_y, src_stride_y, src_v, - src_stride_v, // Swap U and V - src_u, src_stride_u, dst_abgr, dst_stride_abgr, - &kYvuJPEGConstants, // Use Yvu matrix - width, height); -} - -// Convert H422 to ARGB. 
-LIBYUV_API -int H422ToARGB(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_argb, - int dst_stride_argb, - int width, - int height) { - return I422ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v, - src_stride_v, dst_argb, dst_stride_argb, - &kYuvH709Constants, width, height); -} - -// Convert H422 to ABGR. -LIBYUV_API -int H422ToABGR(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_abgr, - int dst_stride_abgr, - int width, - int height) { - return I422ToARGBMatrix(src_y, src_stride_y, src_v, - src_stride_v, // Swap U and V - src_u, src_stride_u, dst_abgr, dst_stride_abgr, - &kYvuH709Constants, // Use Yvu matrix - width, height); -} - -// Convert U422 to ARGB. -LIBYUV_API -int U422ToARGB(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_argb, - int dst_stride_argb, - int width, - int height) { - return I422ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v, - src_stride_v, dst_argb, dst_stride_argb, - &kYuv2020Constants, width, height); -} - -// Convert U422 to ABGR. -LIBYUV_API -int U422ToABGR(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_abgr, - int dst_stride_abgr, - int width, - int height) { - return I422ToARGBMatrix(src_y, src_stride_y, src_v, - src_stride_v, // Swap U and V - src_u, src_stride_u, dst_abgr, dst_stride_abgr, - &kYvu2020Constants, // Use Yvu matrix - width, height); -} - -// Convert I444 to ARGB with matrix. -LIBYUV_API -int I444ToARGBMatrix(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_argb, - int dst_stride_argb, - const struct YuvConstants* yuvconstants, - int width, - int height) { - int y; - void (*I444ToARGBRow)(const uint8_t* y_buf, const uint8_t* u_buf, - const uint8_t* v_buf, uint8_t* rgb_buf, - const struct YuvConstants* yuvconstants, int width) = - I444ToARGBRow_C; - if (!src_y || !src_u || !src_v || !dst_argb || width <= 0 || height == 0) { - return -1; - } - // Negative height means invert the image. - if (height < 0) { - height = -height; - dst_argb = dst_argb + (height - 1) * dst_stride_argb; - dst_stride_argb = -dst_stride_argb; - } - // Coalesce rows. 
- if (src_stride_y == width && src_stride_u == width && src_stride_v == width && - dst_stride_argb == width * 4) { - width *= height; - height = 1; - src_stride_y = src_stride_u = src_stride_v = dst_stride_argb = 0; - } -#if defined(HAS_I444TOARGBROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - I444ToARGBRow = I444ToARGBRow_Any_SSSE3; - if (IS_ALIGNED(width, 8)) { - I444ToARGBRow = I444ToARGBRow_SSSE3; - } - } -#endif -#if defined(HAS_I444TOARGBROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - I444ToARGBRow = I444ToARGBRow_Any_AVX2; - if (IS_ALIGNED(width, 16)) { - I444ToARGBRow = I444ToARGBRow_AVX2; - } - } -#endif -#if defined(HAS_I444TOARGBROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - I444ToARGBRow = I444ToARGBRow_Any_NEON; - if (IS_ALIGNED(width, 8)) { - I444ToARGBRow = I444ToARGBRow_NEON; - } - } -#endif -#if defined(HAS_I444TOARGBROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - I444ToARGBRow = I444ToARGBRow_Any_MMI; - if (IS_ALIGNED(width, 4)) { - I444ToARGBRow = I444ToARGBRow_MMI; - } - } -#endif -#if defined(HAS_I444TOARGBROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - I444ToARGBRow = I444ToARGBRow_Any_MSA; - if (IS_ALIGNED(width, 8)) { - I444ToARGBRow = I444ToARGBRow_MSA; - } - } -#endif - - for (y = 0; y < height; ++y) { - I444ToARGBRow(src_y, src_u, src_v, dst_argb, yuvconstants, width); - dst_argb += dst_stride_argb; - src_y += src_stride_y; - src_u += src_stride_u; - src_v += src_stride_v; - } - return 0; -} - -// Convert I444 to ARGB. -LIBYUV_API -int I444ToARGB(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_argb, - int dst_stride_argb, - int width, - int height) { - return I444ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v, - src_stride_v, dst_argb, dst_stride_argb, - &kYuvI601Constants, width, height); -} - -// Convert I444 to ABGR. -LIBYUV_API -int I444ToABGR(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_abgr, - int dst_stride_abgr, - int width, - int height) { - return I444ToARGBMatrix(src_y, src_stride_y, src_v, - src_stride_v, // Swap U and V - src_u, src_stride_u, dst_abgr, dst_stride_abgr, - &kYvuI601Constants, // Use Yvu matrix - width, height); -} - -// Convert J444 to ARGB. -LIBYUV_API -int J444ToARGB(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_argb, - int dst_stride_argb, - int width, - int height) { - return I444ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v, - src_stride_v, dst_argb, dst_stride_argb, - &kYuvJPEGConstants, width, height); -} - -// Convert J444 to ABGR. -LIBYUV_API -int J444ToABGR(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_abgr, - int dst_stride_abgr, - int width, - int height) { - return I444ToARGBMatrix(src_y, src_stride_y, src_v, - src_stride_v, // Swap U and V - src_u, src_stride_u, dst_abgr, dst_stride_abgr, - &kYvuJPEGConstants, // Use Yvu matrix - width, height); -} - -// Convert H444 to ARGB. 
-LIBYUV_API -int H444ToARGB(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_argb, - int dst_stride_argb, - int width, - int height) { - return I444ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v, - src_stride_v, dst_argb, dst_stride_argb, - &kYuvH709Constants, width, height); -} - -// Convert H444 to ABGR. -LIBYUV_API -int H444ToABGR(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_abgr, - int dst_stride_abgr, - int width, - int height) { - return I444ToARGBMatrix(src_y, src_stride_y, src_v, - src_stride_v, // Swap U and V - src_u, src_stride_u, dst_abgr, dst_stride_abgr, - &kYvuH709Constants, // Use Yvu matrix - width, height); -} - -// Convert U444 to ARGB. -LIBYUV_API -int U444ToARGB(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_argb, - int dst_stride_argb, - int width, - int height) { - return I444ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v, - src_stride_v, dst_argb, dst_stride_argb, - &kYuv2020Constants, width, height); -} - -// Convert U444 to ABGR. -LIBYUV_API -int U444ToABGR(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_abgr, - int dst_stride_abgr, - int width, - int height) { - return I444ToARGBMatrix(src_y, src_stride_y, src_v, - src_stride_v, // Swap U and V - src_u, src_stride_u, dst_abgr, dst_stride_abgr, - &kYvu2020Constants, // Use Yvu matrix - width, height); -} - -// Convert 10 bit YUV to ARGB with matrix. -// TODO(fbarchard): Consider passing scale multiplier to I210ToARGB to -// multiply 10 bit yuv into high bits to allow any number of bits. -LIBYUV_API -int I010ToAR30Matrix(const uint16_t* src_y, - int src_stride_y, - const uint16_t* src_u, - int src_stride_u, - const uint16_t* src_v, - int src_stride_v, - uint8_t* dst_ar30, - int dst_stride_ar30, - const struct YuvConstants* yuvconstants, - int width, - int height) { - int y; - void (*I210ToAR30Row)(const uint16_t* y_buf, const uint16_t* u_buf, - const uint16_t* v_buf, uint8_t* rgb_buf, - const struct YuvConstants* yuvconstants, int width) = - I210ToAR30Row_C; - if (!src_y || !src_u || !src_v || !dst_ar30 || width <= 0 || height == 0) { - return -1; - } - // Negative height means invert the image. - if (height < 0) { - height = -height; - dst_ar30 = dst_ar30 + (height - 1) * dst_stride_ar30; - dst_stride_ar30 = -dst_stride_ar30; - } -#if defined(HAS_I210TOAR30ROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - I210ToAR30Row = I210ToAR30Row_Any_SSSE3; - if (IS_ALIGNED(width, 8)) { - I210ToAR30Row = I210ToAR30Row_SSSE3; - } - } -#endif -#if defined(HAS_I210TOAR30ROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - I210ToAR30Row = I210ToAR30Row_Any_AVX2; - if (IS_ALIGNED(width, 16)) { - I210ToAR30Row = I210ToAR30Row_AVX2; - } - } -#endif - for (y = 0; y < height; ++y) { - I210ToAR30Row(src_y, src_u, src_v, dst_ar30, yuvconstants, width); - dst_ar30 += dst_stride_ar30; - src_y += src_stride_y; - if (y & 1) { - src_u += src_stride_u; - src_v += src_stride_v; - } - } - return 0; -} - -// Convert I010 to AR30. 
-LIBYUV_API -int I010ToAR30(const uint16_t* src_y, - int src_stride_y, - const uint16_t* src_u, - int src_stride_u, - const uint16_t* src_v, - int src_stride_v, - uint8_t* dst_ar30, - int dst_stride_ar30, - int width, - int height) { - return I010ToAR30Matrix(src_y, src_stride_y, src_u, src_stride_u, src_v, - src_stride_v, dst_ar30, dst_stride_ar30, - &kYuvI601Constants, width, height); -} - -// Convert H010 to AR30. -LIBYUV_API -int H010ToAR30(const uint16_t* src_y, - int src_stride_y, - const uint16_t* src_u, - int src_stride_u, - const uint16_t* src_v, - int src_stride_v, - uint8_t* dst_ar30, - int dst_stride_ar30, - int width, - int height) { - return I010ToAR30Matrix(src_y, src_stride_y, src_u, src_stride_u, src_v, - src_stride_v, dst_ar30, dst_stride_ar30, - &kYuvH709Constants, width, height); -} - -// Convert U010 to AR30. -LIBYUV_API -int U010ToAR30(const uint16_t* src_y, - int src_stride_y, - const uint16_t* src_u, - int src_stride_u, - const uint16_t* src_v, - int src_stride_v, - uint8_t* dst_ar30, - int dst_stride_ar30, - int width, - int height) { - return I010ToAR30Matrix(src_y, src_stride_y, src_u, src_stride_u, src_v, - src_stride_v, dst_ar30, dst_stride_ar30, - &kYuv2020Constants, width, height); -} - -// Convert I010 to AB30. -LIBYUV_API -int I010ToAB30(const uint16_t* src_y, - int src_stride_y, - const uint16_t* src_u, - int src_stride_u, - const uint16_t* src_v, - int src_stride_v, - uint8_t* dst_ab30, - int dst_stride_ab30, - int width, - int height) { - return I010ToAR30Matrix(src_y, src_stride_y, src_v, src_stride_v, src_u, - src_stride_u, dst_ab30, dst_stride_ab30, - &kYvuI601Constants, width, height); -} - -// Convert H010 to AB30. -LIBYUV_API -int H010ToAB30(const uint16_t* src_y, - int src_stride_y, - const uint16_t* src_u, - int src_stride_u, - const uint16_t* src_v, - int src_stride_v, - uint8_t* dst_ab30, - int dst_stride_ab30, - int width, - int height) { - return I010ToAR30Matrix(src_y, src_stride_y, src_v, src_stride_v, src_u, - src_stride_u, dst_ab30, dst_stride_ab30, - &kYvuH709Constants, width, height); -} - -// Convert U010 to AB30. -LIBYUV_API -int U010ToAB30(const uint16_t* src_y, - int src_stride_y, - const uint16_t* src_u, - int src_stride_u, - const uint16_t* src_v, - int src_stride_v, - uint8_t* dst_ab30, - int dst_stride_ab30, - int width, - int height) { - return I010ToAR30Matrix(src_y, src_stride_y, src_v, src_stride_v, src_u, - src_stride_u, dst_ab30, dst_stride_ab30, - &kYuv2020Constants, width, height); -} - -// Convert 12 bit YUV to ARGB with matrix. -// TODO(fbarchard): Consider passing scale multiplier to I212ToARGB to -// multiply 12 bit yuv into high bits to allow any number of bits. -LIBYUV_API -int I012ToAR30Matrix(const uint16_t* src_y, - int src_stride_y, - const uint16_t* src_u, - int src_stride_u, - const uint16_t* src_v, - int src_stride_v, - uint8_t* dst_ar30, - int dst_stride_ar30, - const struct YuvConstants* yuvconstants, - int width, - int height) { - int y; - void (*I212ToAR30Row)(const uint16_t* y_buf, const uint16_t* u_buf, - const uint16_t* v_buf, uint8_t* rgb_buf, - const struct YuvConstants* yuvconstants, int width) = - I212ToAR30Row_C; - if (!src_y || !src_u || !src_v || !dst_ar30 || width <= 0 || height == 0) { - return -1; - } - // Negative height means invert the image. 
- if (height < 0) { - height = -height; - dst_ar30 = dst_ar30 + (height - 1) * dst_stride_ar30; - dst_stride_ar30 = -dst_stride_ar30; - } -#if defined(HAS_I212TOAR30ROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - I212ToAR30Row = I212ToAR30Row_Any_SSSE3; - if (IS_ALIGNED(width, 8)) { - I212ToAR30Row = I212ToAR30Row_SSSE3; - } - } -#endif -#if defined(HAS_I212TOAR30ROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - I212ToAR30Row = I212ToAR30Row_Any_AVX2; - if (IS_ALIGNED(width, 16)) { - I212ToAR30Row = I212ToAR30Row_AVX2; - } - } -#endif - for (y = 0; y < height; ++y) { - I212ToAR30Row(src_y, src_u, src_v, dst_ar30, yuvconstants, width); - dst_ar30 += dst_stride_ar30; - src_y += src_stride_y; - if (y & 1) { - src_u += src_stride_u; - src_v += src_stride_v; - } - } - return 0; -} - -// Convert 10 bit YUV to ARGB with matrix. -// TODO(fbarchard): Consider passing scale multiplier to I210ToARGB to -// multiply 10 bit yuv into high bits to allow any number of bits. -LIBYUV_API -int I210ToAR30Matrix(const uint16_t* src_y, - int src_stride_y, - const uint16_t* src_u, - int src_stride_u, - const uint16_t* src_v, - int src_stride_v, - uint8_t* dst_ar30, - int dst_stride_ar30, - const struct YuvConstants* yuvconstants, - int width, - int height) { - int y; - void (*I210ToAR30Row)(const uint16_t* y_buf, const uint16_t* u_buf, - const uint16_t* v_buf, uint8_t* rgb_buf, - const struct YuvConstants* yuvconstants, int width) = - I210ToAR30Row_C; - if (!src_y || !src_u || !src_v || !dst_ar30 || width <= 0 || height == 0) { - return -1; - } - // Negative height means invert the image. - if (height < 0) { - height = -height; - dst_ar30 = dst_ar30 + (height - 1) * dst_stride_ar30; - dst_stride_ar30 = -dst_stride_ar30; - } -#if defined(HAS_I210TOAR30ROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - I210ToAR30Row = I210ToAR30Row_Any_SSSE3; - if (IS_ALIGNED(width, 8)) { - I210ToAR30Row = I210ToAR30Row_SSSE3; - } - } -#endif -#if defined(HAS_I210TOAR30ROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - I210ToAR30Row = I210ToAR30Row_Any_AVX2; - if (IS_ALIGNED(width, 16)) { - I210ToAR30Row = I210ToAR30Row_AVX2; - } - } -#endif - for (y = 0; y < height; ++y) { - I210ToAR30Row(src_y, src_u, src_v, dst_ar30, yuvconstants, width); - dst_ar30 += dst_stride_ar30; - src_y += src_stride_y; - src_u += src_stride_u; - src_v += src_stride_v; - } - return 0; -} - -// Convert I210 to AR30. -LIBYUV_API -int I210ToAR30(const uint16_t* src_y, - int src_stride_y, - const uint16_t* src_u, - int src_stride_u, - const uint16_t* src_v, - int src_stride_v, - uint8_t* dst_ar30, - int dst_stride_ar30, - int width, - int height) { - return I210ToAR30Matrix(src_y, src_stride_y, src_u, src_stride_u, src_v, - src_stride_v, dst_ar30, dst_stride_ar30, - &kYuvI601Constants, width, height); -} - -// Convert H210 to AR30. -LIBYUV_API -int H210ToAR30(const uint16_t* src_y, - int src_stride_y, - const uint16_t* src_u, - int src_stride_u, - const uint16_t* src_v, - int src_stride_v, - uint8_t* dst_ar30, - int dst_stride_ar30, - int width, - int height) { - return I210ToAR30Matrix(src_y, src_stride_y, src_u, src_stride_u, src_v, - src_stride_v, dst_ar30, dst_stride_ar30, - &kYuvH709Constants, width, height); -} - -// Convert U210 to AR30. 
-LIBYUV_API -int U210ToAR30(const uint16_t* src_y, - int src_stride_y, - const uint16_t* src_u, - int src_stride_u, - const uint16_t* src_v, - int src_stride_v, - uint8_t* dst_ar30, - int dst_stride_ar30, - int width, - int height) { - return I210ToAR30Matrix(src_y, src_stride_y, src_u, src_stride_u, src_v, - src_stride_v, dst_ar30, dst_stride_ar30, - &kYuv2020Constants, width, height); -} - -// Convert I210 to AB30. -LIBYUV_API -int I210ToAB30(const uint16_t* src_y, - int src_stride_y, - const uint16_t* src_u, - int src_stride_u, - const uint16_t* src_v, - int src_stride_v, - uint8_t* dst_ab30, - int dst_stride_ab30, - int width, - int height) { - return I210ToAR30Matrix(src_y, src_stride_y, src_v, src_stride_v, src_u, - src_stride_u, dst_ab30, dst_stride_ab30, - &kYvuI601Constants, width, height); -} - -// Convert H210 to AB30. -LIBYUV_API -int H210ToAB30(const uint16_t* src_y, - int src_stride_y, - const uint16_t* src_u, - int src_stride_u, - const uint16_t* src_v, - int src_stride_v, - uint8_t* dst_ab30, - int dst_stride_ab30, - int width, - int height) { - return I210ToAR30Matrix(src_y, src_stride_y, src_v, src_stride_v, src_u, - src_stride_u, dst_ab30, dst_stride_ab30, - &kYvuH709Constants, width, height); -} - -// Convert U210 to AB30. -LIBYUV_API -int U210ToAB30(const uint16_t* src_y, - int src_stride_y, - const uint16_t* src_u, - int src_stride_u, - const uint16_t* src_v, - int src_stride_v, - uint8_t* dst_ab30, - int dst_stride_ab30, - int width, - int height) { - return I210ToAR30Matrix(src_y, src_stride_y, src_v, src_stride_v, src_u, - src_stride_u, dst_ab30, dst_stride_ab30, - &kYuv2020Constants, width, height); -} - -LIBYUV_API -int I410ToAR30Matrix(const uint16_t* src_y, - int src_stride_y, - const uint16_t* src_u, - int src_stride_u, - const uint16_t* src_v, - int src_stride_v, - uint8_t* dst_ar30, - int dst_stride_ar30, - const struct YuvConstants* yuvconstants, - int width, - int height) { - int y; - void (*I410ToAR30Row)(const uint16_t* y_buf, const uint16_t* u_buf, - const uint16_t* v_buf, uint8_t* rgb_buf, - const struct YuvConstants* yuvconstants, int width) = - I410ToAR30Row_C; - if (!src_y || !src_u || !src_v || !dst_ar30 || width <= 0 || height == 0) { - return -1; - } - // Negative height means invert the image. - if (height < 0) { - height = -height; - dst_ar30 = dst_ar30 + (height - 1) * dst_stride_ar30; - dst_stride_ar30 = -dst_stride_ar30; - } -#if defined(HAS_I410TOAR30ROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - I410ToAR30Row = I410ToAR30Row_Any_SSSE3; - if (IS_ALIGNED(width, 8)) { - I410ToAR30Row = I410ToAR30Row_SSSE3; - } - } -#endif -#if defined(HAS_I410TOAR30ROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - I410ToAR30Row = I410ToAR30Row_Any_AVX2; - if (IS_ALIGNED(width, 16)) { - I410ToAR30Row = I410ToAR30Row_AVX2; - } - } -#endif - for (y = 0; y < height; ++y) { - I410ToAR30Row(src_y, src_u, src_v, dst_ar30, yuvconstants, width); - dst_ar30 += dst_stride_ar30; - src_y += src_stride_y; - src_u += src_stride_u; - src_v += src_stride_v; - } - return 0; -} - -// Convert 10 bit YUV to ARGB with matrix. 
-LIBYUV_API -int I010ToARGBMatrix(const uint16_t* src_y, - int src_stride_y, - const uint16_t* src_u, - int src_stride_u, - const uint16_t* src_v, - int src_stride_v, - uint8_t* dst_argb, - int dst_stride_argb, - const struct YuvConstants* yuvconstants, - int width, - int height) { - int y; - void (*I210ToARGBRow)(const uint16_t* y_buf, const uint16_t* u_buf, - const uint16_t* v_buf, uint8_t* rgb_buf, - const struct YuvConstants* yuvconstants, int width) = - I210ToARGBRow_C; - if (!src_y || !src_u || !src_v || !dst_argb || width <= 0 || height == 0) { - return -1; - } - // Negative height means invert the image. - if (height < 0) { - height = -height; - dst_argb = dst_argb + (height - 1) * dst_stride_argb; - dst_stride_argb = -dst_stride_argb; - } -#if defined(HAS_I210TOARGBROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - I210ToARGBRow = I210ToARGBRow_Any_SSSE3; - if (IS_ALIGNED(width, 8)) { - I210ToARGBRow = I210ToARGBRow_SSSE3; - } - } -#endif -#if defined(HAS_I210TOARGBROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - I210ToARGBRow = I210ToARGBRow_Any_AVX2; - if (IS_ALIGNED(width, 16)) { - I210ToARGBRow = I210ToARGBRow_AVX2; - } - } -#endif - for (y = 0; y < height; ++y) { - I210ToARGBRow(src_y, src_u, src_v, dst_argb, yuvconstants, width); - dst_argb += dst_stride_argb; - src_y += src_stride_y; - if (y & 1) { - src_u += src_stride_u; - src_v += src_stride_v; - } - } - return 0; -} - -// Convert I010 to ARGB. -LIBYUV_API -int I010ToARGB(const uint16_t* src_y, - int src_stride_y, - const uint16_t* src_u, - int src_stride_u, - const uint16_t* src_v, - int src_stride_v, - uint8_t* dst_argb, - int dst_stride_argb, - int width, - int height) { - return I010ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v, - src_stride_v, dst_argb, dst_stride_argb, - &kYuvI601Constants, width, height); -} - -// Convert I010 to ABGR. -LIBYUV_API -int I010ToABGR(const uint16_t* src_y, - int src_stride_y, - const uint16_t* src_u, - int src_stride_u, - const uint16_t* src_v, - int src_stride_v, - uint8_t* dst_abgr, - int dst_stride_abgr, - int width, - int height) { - return I010ToARGBMatrix(src_y, src_stride_y, src_v, - src_stride_v, // Swap U and V - src_u, src_stride_u, dst_abgr, dst_stride_abgr, - &kYvuI601Constants, // Use Yvu matrix - width, height); -} - -// Convert H010 to ARGB. -LIBYUV_API -int H010ToARGB(const uint16_t* src_y, - int src_stride_y, - const uint16_t* src_u, - int src_stride_u, - const uint16_t* src_v, - int src_stride_v, - uint8_t* dst_argb, - int dst_stride_argb, - int width, - int height) { - return I010ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v, - src_stride_v, dst_argb, dst_stride_argb, - &kYuvH709Constants, width, height); -} - -// Convert H010 to ABGR. -LIBYUV_API -int H010ToABGR(const uint16_t* src_y, - int src_stride_y, - const uint16_t* src_u, - int src_stride_u, - const uint16_t* src_v, - int src_stride_v, - uint8_t* dst_abgr, - int dst_stride_abgr, - int width, - int height) { - return I010ToARGBMatrix(src_y, src_stride_y, src_v, - src_stride_v, // Swap U and V - src_u, src_stride_u, dst_abgr, dst_stride_abgr, - &kYvuH709Constants, // Use Yvu matrix - width, height); -} - -// Convert U010 to ARGB. 
-LIBYUV_API -int U010ToARGB(const uint16_t* src_y, - int src_stride_y, - const uint16_t* src_u, - int src_stride_u, - const uint16_t* src_v, - int src_stride_v, - uint8_t* dst_argb, - int dst_stride_argb, - int width, - int height) { - return I010ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v, - src_stride_v, dst_argb, dst_stride_argb, - &kYuv2020Constants, width, height); -} - -// Convert U010 to ABGR. -LIBYUV_API -int U010ToABGR(const uint16_t* src_y, - int src_stride_y, - const uint16_t* src_u, - int src_stride_u, - const uint16_t* src_v, - int src_stride_v, - uint8_t* dst_abgr, - int dst_stride_abgr, - int width, - int height) { - return I010ToARGBMatrix(src_y, src_stride_y, src_v, - src_stride_v, // Swap U and V - src_u, src_stride_u, dst_abgr, dst_stride_abgr, - &kYvu2020Constants, // Use Yvu matrix - width, height); -} - -// Convert 12 bit YUV to ARGB with matrix. -LIBYUV_API -int I012ToARGBMatrix(const uint16_t* src_y, - int src_stride_y, - const uint16_t* src_u, - int src_stride_u, - const uint16_t* src_v, - int src_stride_v, - uint8_t* dst_argb, - int dst_stride_argb, - const struct YuvConstants* yuvconstants, - int width, - int height) { - int y; - void (*I212ToARGBRow)(const uint16_t* y_buf, const uint16_t* u_buf, - const uint16_t* v_buf, uint8_t* rgb_buf, - const struct YuvConstants* yuvconstants, int width) = - I212ToARGBRow_C; - if (!src_y || !src_u || !src_v || !dst_argb || width <= 0 || height == 0) { - return -1; - } - // Negative height means invert the image. - if (height < 0) { - height = -height; - dst_argb = dst_argb + (height - 1) * dst_stride_argb; - dst_stride_argb = -dst_stride_argb; - } -#if defined(HAS_I212TOARGBROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - I212ToARGBRow = I212ToARGBRow_Any_SSSE3; - if (IS_ALIGNED(width, 8)) { - I212ToARGBRow = I212ToARGBRow_SSSE3; - } - } -#endif -#if defined(HAS_I212TOARGBROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - I212ToARGBRow = I212ToARGBRow_Any_AVX2; - if (IS_ALIGNED(width, 16)) { - I212ToARGBRow = I212ToARGBRow_AVX2; - } - } -#endif - for (y = 0; y < height; ++y) { - I212ToARGBRow(src_y, src_u, src_v, dst_argb, yuvconstants, width); - dst_argb += dst_stride_argb; - src_y += src_stride_y; - if (y & 1) { - src_u += src_stride_u; - src_v += src_stride_v; - } - } - return 0; -} - -// Convert 10 bit 422 YUV to ARGB with matrix. -LIBYUV_API -int I210ToARGBMatrix(const uint16_t* src_y, - int src_stride_y, - const uint16_t* src_u, - int src_stride_u, - const uint16_t* src_v, - int src_stride_v, - uint8_t* dst_argb, - int dst_stride_argb, - const struct YuvConstants* yuvconstants, - int width, - int height) { - int y; - void (*I210ToARGBRow)(const uint16_t* y_buf, const uint16_t* u_buf, - const uint16_t* v_buf, uint8_t* rgb_buf, - const struct YuvConstants* yuvconstants, int width) = - I210ToARGBRow_C; - if (!src_y || !src_u || !src_v || !dst_argb || width <= 0 || height == 0) { - return -1; - } - // Negative height means invert the image. 
- if (height < 0) { - height = -height; - dst_argb = dst_argb + (height - 1) * dst_stride_argb; - dst_stride_argb = -dst_stride_argb; - } -#if defined(HAS_I210TOARGBROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - I210ToARGBRow = I210ToARGBRow_Any_SSSE3; - if (IS_ALIGNED(width, 8)) { - I210ToARGBRow = I210ToARGBRow_SSSE3; - } - } -#endif -#if defined(HAS_I210TOARGBROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - I210ToARGBRow = I210ToARGBRow_Any_AVX2; - if (IS_ALIGNED(width, 16)) { - I210ToARGBRow = I210ToARGBRow_AVX2; - } - } -#endif - for (y = 0; y < height; ++y) { - I210ToARGBRow(src_y, src_u, src_v, dst_argb, yuvconstants, width); - dst_argb += dst_stride_argb; - src_y += src_stride_y; - src_u += src_stride_u; - src_v += src_stride_v; - } - return 0; -} - -// Convert I210 to ARGB. -LIBYUV_API -int I210ToARGB(const uint16_t* src_y, - int src_stride_y, - const uint16_t* src_u, - int src_stride_u, - const uint16_t* src_v, - int src_stride_v, - uint8_t* dst_argb, - int dst_stride_argb, - int width, - int height) { - return I210ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v, - src_stride_v, dst_argb, dst_stride_argb, - &kYuvI601Constants, width, height); -} - -// Convert I210 to ABGR. -LIBYUV_API -int I210ToABGR(const uint16_t* src_y, - int src_stride_y, - const uint16_t* src_u, - int src_stride_u, - const uint16_t* src_v, - int src_stride_v, - uint8_t* dst_abgr, - int dst_stride_abgr, - int width, - int height) { - return I210ToARGBMatrix(src_y, src_stride_y, src_v, - src_stride_v, // Swap U and V - src_u, src_stride_u, dst_abgr, dst_stride_abgr, - &kYvuI601Constants, // Use Yvu matrix - width, height); -} - -// Convert H210 to ARGB. -LIBYUV_API -int H210ToARGB(const uint16_t* src_y, - int src_stride_y, - const uint16_t* src_u, - int src_stride_u, - const uint16_t* src_v, - int src_stride_v, - uint8_t* dst_argb, - int dst_stride_argb, - int width, - int height) { - return I210ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v, - src_stride_v, dst_argb, dst_stride_argb, - &kYuvH709Constants, width, height); -} - -// Convert H210 to ABGR. -LIBYUV_API -int H210ToABGR(const uint16_t* src_y, - int src_stride_y, - const uint16_t* src_u, - int src_stride_u, - const uint16_t* src_v, - int src_stride_v, - uint8_t* dst_abgr, - int dst_stride_abgr, - int width, - int height) { - return I210ToARGBMatrix(src_y, src_stride_y, src_v, - src_stride_v, // Swap U and V - src_u, src_stride_u, dst_abgr, dst_stride_abgr, - &kYvuH709Constants, // Use Yvu matrix - width, height); -} - -// Convert U210 to ARGB. -LIBYUV_API -int U210ToARGB(const uint16_t* src_y, - int src_stride_y, - const uint16_t* src_u, - int src_stride_u, - const uint16_t* src_v, - int src_stride_v, - uint8_t* dst_argb, - int dst_stride_argb, - int width, - int height) { - return I210ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v, - src_stride_v, dst_argb, dst_stride_argb, - &kYuv2020Constants, width, height); -} - -// Convert U210 to ABGR. 
-LIBYUV_API -int U210ToABGR(const uint16_t* src_y, - int src_stride_y, - const uint16_t* src_u, - int src_stride_u, - const uint16_t* src_v, - int src_stride_v, - uint8_t* dst_abgr, - int dst_stride_abgr, - int width, - int height) { - return I210ToARGBMatrix(src_y, src_stride_y, src_v, - src_stride_v, // Swap U and V - src_u, src_stride_u, dst_abgr, dst_stride_abgr, - &kYvu2020Constants, // Use Yvu matrix - width, height); -} - -LIBYUV_API -int I410ToARGBMatrix(const uint16_t* src_y, - int src_stride_y, - const uint16_t* src_u, - int src_stride_u, - const uint16_t* src_v, - int src_stride_v, - uint8_t* dst_argb, - int dst_stride_argb, - const struct YuvConstants* yuvconstants, - int width, - int height) { - int y; - void (*I410ToARGBRow)(const uint16_t* y_buf, const uint16_t* u_buf, - const uint16_t* v_buf, uint8_t* rgb_buf, - const struct YuvConstants* yuvconstants, int width) = - I410ToARGBRow_C; - if (!src_y || !src_u || !src_v || !dst_argb || width <= 0 || height == 0) { - return -1; - } - // Negative height means invert the image. - if (height < 0) { - height = -height; - dst_argb = dst_argb + (height - 1) * dst_stride_argb; - dst_stride_argb = -dst_stride_argb; - } -#if defined(HAS_I410TOARGBROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - I410ToARGBRow = I410ToARGBRow_Any_SSSE3; - if (IS_ALIGNED(width, 8)) { - I410ToARGBRow = I410ToARGBRow_SSSE3; - } - } -#endif -#if defined(HAS_I410TOARGBROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - I410ToARGBRow = I410ToARGBRow_Any_AVX2; - if (IS_ALIGNED(width, 16)) { - I410ToARGBRow = I410ToARGBRow_AVX2; - } - } -#endif - for (y = 0; y < height; ++y) { - I410ToARGBRow(src_y, src_u, src_v, dst_argb, yuvconstants, width); - dst_argb += dst_stride_argb; - src_y += src_stride_y; - src_u += src_stride_u; - src_v += src_stride_v; - } - return 0; -} - -LIBYUV_API -int P010ToARGBMatrix(const uint16_t* src_y, - int src_stride_y, - const uint16_t* src_uv, - int src_stride_uv, - uint8_t* dst_argb, - int dst_stride_argb, - const struct YuvConstants* yuvconstants, - int width, - int height) { - int y; - void (*P210ToARGBRow)( - const uint16_t* y_buf, const uint16_t* uv_buf, uint8_t* rgb_buf, - const struct YuvConstants* yuvconstants, int width) = P210ToARGBRow_C; - if (!src_y || !src_uv || !dst_argb || width <= 0 || height == 0) { - return -1; - } - // Negative height means invert the image. 
- if (height < 0) { - height = -height; - dst_argb = dst_argb + (height - 1) * dst_stride_argb; - dst_stride_argb = -dst_stride_argb; - } -#if defined(HAS_P210TOARGBROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - P210ToARGBRow = P210ToARGBRow_Any_SSSE3; - if (IS_ALIGNED(width, 8)) { - P210ToARGBRow = P210ToARGBRow_SSSE3; - } - } -#endif -#if defined(HAS_P210TOARGBROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - P210ToARGBRow = P210ToARGBRow_Any_AVX2; - if (IS_ALIGNED(width, 16)) { - P210ToARGBRow = P210ToARGBRow_AVX2; - } - } -#endif - for (y = 0; y < height; ++y) { - P210ToARGBRow(src_y, src_uv, dst_argb, yuvconstants, width); - dst_argb += dst_stride_argb; - src_y += src_stride_y; - if (y & 1) { - src_uv += src_stride_uv; - } - } - return 0; -} - -LIBYUV_API -int P210ToARGBMatrix(const uint16_t* src_y, - int src_stride_y, - const uint16_t* src_uv, - int src_stride_uv, - uint8_t* dst_argb, - int dst_stride_argb, - const struct YuvConstants* yuvconstants, - int width, - int height) { - int y; - void (*P210ToARGBRow)( - const uint16_t* y_buf, const uint16_t* uv_buf, uint8_t* rgb_buf, - const struct YuvConstants* yuvconstants, int width) = P210ToARGBRow_C; - if (!src_y || !src_uv || !dst_argb || width <= 0 || height == 0) { - return -1; - } - // Negative height means invert the image. - if (height < 0) { - height = -height; - dst_argb = dst_argb + (height - 1) * dst_stride_argb; - dst_stride_argb = -dst_stride_argb; - } -#if defined(HAS_P210TOARGBROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - P210ToARGBRow = P210ToARGBRow_Any_SSSE3; - if (IS_ALIGNED(width, 8)) { - P210ToARGBRow = P210ToARGBRow_SSSE3; - } - } -#endif -#if defined(HAS_P210TOARGBROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - P210ToARGBRow = P210ToARGBRow_Any_AVX2; - if (IS_ALIGNED(width, 16)) { - P210ToARGBRow = P210ToARGBRow_AVX2; - } - } -#endif - for (y = 0; y < height; ++y) { - P210ToARGBRow(src_y, src_uv, dst_argb, yuvconstants, width); - dst_argb += dst_stride_argb; - src_y += src_stride_y; - src_uv += src_stride_uv; - } - return 0; -} - -LIBYUV_API -int P010ToAR30Matrix(const uint16_t* src_y, - int src_stride_y, - const uint16_t* src_uv, - int src_stride_uv, - uint8_t* dst_ar30, - int dst_stride_ar30, - const struct YuvConstants* yuvconstants, - int width, - int height) { - int y; - void (*P210ToAR30Row)( - const uint16_t* y_buf, const uint16_t* uv_buf, uint8_t* rgb_buf, - const struct YuvConstants* yuvconstants, int width) = P210ToAR30Row_C; - if (!src_y || !src_uv || !dst_ar30 || width <= 0 || height == 0) { - return -1; - } - // Negative height means invert the image. 
- if (height < 0) { - height = -height; - dst_ar30 = dst_ar30 + (height - 1) * dst_stride_ar30; - dst_stride_ar30 = -dst_stride_ar30; - } -#if defined(HAS_P210TOAR30ROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - P210ToAR30Row = P210ToAR30Row_Any_SSSE3; - if (IS_ALIGNED(width, 8)) { - P210ToAR30Row = P210ToAR30Row_SSSE3; - } - } -#endif -#if defined(HAS_P210TOAR30ROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - P210ToAR30Row = P210ToAR30Row_Any_AVX2; - if (IS_ALIGNED(width, 16)) { - P210ToAR30Row = P210ToAR30Row_AVX2; - } - } -#endif - for (y = 0; y < height; ++y) { - P210ToAR30Row(src_y, src_uv, dst_ar30, yuvconstants, width); - dst_ar30 += dst_stride_ar30; - src_y += src_stride_y; - if (y & 1) { - src_uv += src_stride_uv; - } - } - return 0; -} - -LIBYUV_API -int P210ToAR30Matrix(const uint16_t* src_y, - int src_stride_y, - const uint16_t* src_uv, - int src_stride_uv, - uint8_t* dst_ar30, - int dst_stride_ar30, - const struct YuvConstants* yuvconstants, - int width, - int height) { - int y; - void (*P210ToAR30Row)( - const uint16_t* y_buf, const uint16_t* uv_buf, uint8_t* rgb_buf, - const struct YuvConstants* yuvconstants, int width) = P210ToAR30Row_C; - if (!src_y || !src_uv || !dst_ar30 || width <= 0 || height == 0) { - return -1; - } - // Negative height means invert the image. - if (height < 0) { - height = -height; - dst_ar30 = dst_ar30 + (height - 1) * dst_stride_ar30; - dst_stride_ar30 = -dst_stride_ar30; - } -#if defined(HAS_P210TOAR30ROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - P210ToAR30Row = P210ToAR30Row_Any_SSSE3; - if (IS_ALIGNED(width, 8)) { - P210ToAR30Row = P210ToAR30Row_SSSE3; - } - } -#endif -#if defined(HAS_P210TOAR30ROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - P210ToAR30Row = P210ToAR30Row_Any_AVX2; - if (IS_ALIGNED(width, 16)) { - P210ToAR30Row = P210ToAR30Row_AVX2; - } - } -#endif - for (y = 0; y < height; ++y) { - P210ToAR30Row(src_y, src_uv, dst_ar30, yuvconstants, width); - dst_ar30 += dst_stride_ar30; - src_y += src_stride_y; - src_uv += src_stride_uv; - } - return 0; -} - -// Convert I420 with Alpha to preattenuated ARGB with matrix. -LIBYUV_API -int I420AlphaToARGBMatrix(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - const uint8_t* src_a, - int src_stride_a, - uint8_t* dst_argb, - int dst_stride_argb, - const struct YuvConstants* yuvconstants, - int width, - int height, - int attenuate) { - int y; - void (*I422AlphaToARGBRow)(const uint8_t* y_buf, const uint8_t* u_buf, - const uint8_t* v_buf, const uint8_t* a_buf, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width) = I422AlphaToARGBRow_C; - void (*ARGBAttenuateRow)(const uint8_t* src_argb, uint8_t* dst_argb, - int width) = ARGBAttenuateRow_C; - if (!src_y || !src_u || !src_v || !dst_argb || width <= 0 || height == 0) { - return -1; - } - // Negative height means invert the image. 
- if (height < 0) { - height = -height; - dst_argb = dst_argb + (height - 1) * dst_stride_argb; - dst_stride_argb = -dst_stride_argb; - } -#if defined(HAS_I422ALPHATOARGBROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - I422AlphaToARGBRow = I422AlphaToARGBRow_Any_SSSE3; - if (IS_ALIGNED(width, 8)) { - I422AlphaToARGBRow = I422AlphaToARGBRow_SSSE3; - } - } -#endif -#if defined(HAS_I422ALPHATOARGBROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - I422AlphaToARGBRow = I422AlphaToARGBRow_Any_AVX2; - if (IS_ALIGNED(width, 16)) { - I422AlphaToARGBRow = I422AlphaToARGBRow_AVX2; - } - } -#endif -#if defined(HAS_I422ALPHATOARGBROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - I422AlphaToARGBRow = I422AlphaToARGBRow_Any_NEON; - if (IS_ALIGNED(width, 8)) { - I422AlphaToARGBRow = I422AlphaToARGBRow_NEON; - } - } -#endif -#if defined(HAS_I422ALPHATOARGBROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - I422AlphaToARGBRow = I422AlphaToARGBRow_Any_MMI; - if (IS_ALIGNED(width, 4)) { - I422AlphaToARGBRow = I422AlphaToARGBRow_MMI; - } - } -#endif -#if defined(HAS_I422ALPHATOARGBROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - I422AlphaToARGBRow = I422AlphaToARGBRow_Any_MSA; - if (IS_ALIGNED(width, 8)) { - I422AlphaToARGBRow = I422AlphaToARGBRow_MSA; - } - } -#endif -#if defined(HAS_ARGBATTENUATEROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - ARGBAttenuateRow = ARGBAttenuateRow_Any_SSSE3; - if (IS_ALIGNED(width, 4)) { - ARGBAttenuateRow = ARGBAttenuateRow_SSSE3; - } - } -#endif -#if defined(HAS_ARGBATTENUATEROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - ARGBAttenuateRow = ARGBAttenuateRow_Any_AVX2; - if (IS_ALIGNED(width, 8)) { - ARGBAttenuateRow = ARGBAttenuateRow_AVX2; - } - } -#endif -#if defined(HAS_ARGBATTENUATEROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - ARGBAttenuateRow = ARGBAttenuateRow_Any_NEON; - if (IS_ALIGNED(width, 8)) { - ARGBAttenuateRow = ARGBAttenuateRow_NEON; - } - } -#endif -#if defined(HAS_ARGBATTENUATEROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - ARGBAttenuateRow = ARGBAttenuateRow_Any_MMI; - if (IS_ALIGNED(width, 2)) { - ARGBAttenuateRow = ARGBAttenuateRow_MMI; - } - } -#endif -#if defined(HAS_ARGBATTENUATEROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - ARGBAttenuateRow = ARGBAttenuateRow_Any_MSA; - if (IS_ALIGNED(width, 8)) { - ARGBAttenuateRow = ARGBAttenuateRow_MSA; - } - } -#endif - - for (y = 0; y < height; ++y) { - I422AlphaToARGBRow(src_y, src_u, src_v, src_a, dst_argb, yuvconstants, - width); - if (attenuate) { - ARGBAttenuateRow(dst_argb, dst_argb, width); - } - dst_argb += dst_stride_argb; - src_a += src_stride_a; - src_y += src_stride_y; - if (y & 1) { - src_u += src_stride_u; - src_v += src_stride_v; - } - } - return 0; -} - -// Convert I422 with Alpha to preattenuated ARGB with matrix. -LIBYUV_API -int I422AlphaToARGBMatrix(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - const uint8_t* src_a, - int src_stride_a, - uint8_t* dst_argb, - int dst_stride_argb, - const struct YuvConstants* yuvconstants, - int width, - int height, - int attenuate) { - int y; - void (*I422AlphaToARGBRow)(const uint8_t* y_buf, const uint8_t* u_buf, - const uint8_t* v_buf, const uint8_t* a_buf, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width) = I422AlphaToARGBRow_C; - void (*ARGBAttenuateRow)(const uint8_t* src_argb, uint8_t* dst_argb, - int width) = ARGBAttenuateRow_C; - if (!src_y || !src_u || !src_v || !dst_argb || width <= 0 || height == 0) { - return -1; - } - // Negative height means invert the image. 
- if (height < 0) { - height = -height; - dst_argb = dst_argb + (height - 1) * dst_stride_argb; - dst_stride_argb = -dst_stride_argb; - } -#if defined(HAS_I422ALPHATOARGBROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - I422AlphaToARGBRow = I422AlphaToARGBRow_Any_SSSE3; - if (IS_ALIGNED(width, 8)) { - I422AlphaToARGBRow = I422AlphaToARGBRow_SSSE3; - } - } -#endif -#if defined(HAS_I422ALPHATOARGBROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - I422AlphaToARGBRow = I422AlphaToARGBRow_Any_AVX2; - if (IS_ALIGNED(width, 16)) { - I422AlphaToARGBRow = I422AlphaToARGBRow_AVX2; - } - } -#endif -#if defined(HAS_I422ALPHATOARGBROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - I422AlphaToARGBRow = I422AlphaToARGBRow_Any_NEON; - if (IS_ALIGNED(width, 8)) { - I422AlphaToARGBRow = I422AlphaToARGBRow_NEON; - } - } -#endif -#if defined(HAS_I422ALPHATOARGBROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - I422AlphaToARGBRow = I422AlphaToARGBRow_Any_MMI; - if (IS_ALIGNED(width, 4)) { - I422AlphaToARGBRow = I422AlphaToARGBRow_MMI; - } - } -#endif -#if defined(HAS_I422ALPHATOARGBROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - I422AlphaToARGBRow = I422AlphaToARGBRow_Any_MSA; - if (IS_ALIGNED(width, 8)) { - I422AlphaToARGBRow = I422AlphaToARGBRow_MSA; - } - } -#endif -#if defined(HAS_ARGBATTENUATEROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - ARGBAttenuateRow = ARGBAttenuateRow_Any_SSSE3; - if (IS_ALIGNED(width, 4)) { - ARGBAttenuateRow = ARGBAttenuateRow_SSSE3; - } - } -#endif -#if defined(HAS_ARGBATTENUATEROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - ARGBAttenuateRow = ARGBAttenuateRow_Any_AVX2; - if (IS_ALIGNED(width, 8)) { - ARGBAttenuateRow = ARGBAttenuateRow_AVX2; - } - } -#endif -#if defined(HAS_ARGBATTENUATEROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - ARGBAttenuateRow = ARGBAttenuateRow_Any_NEON; - if (IS_ALIGNED(width, 8)) { - ARGBAttenuateRow = ARGBAttenuateRow_NEON; - } - } -#endif -#if defined(HAS_ARGBATTENUATEROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - ARGBAttenuateRow = ARGBAttenuateRow_Any_MMI; - if (IS_ALIGNED(width, 2)) { - ARGBAttenuateRow = ARGBAttenuateRow_MMI; - } - } -#endif -#if defined(HAS_ARGBATTENUATEROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - ARGBAttenuateRow = ARGBAttenuateRow_Any_MSA; - if (IS_ALIGNED(width, 8)) { - ARGBAttenuateRow = ARGBAttenuateRow_MSA; - } - } -#endif - - for (y = 0; y < height; ++y) { - I422AlphaToARGBRow(src_y, src_u, src_v, src_a, dst_argb, yuvconstants, - width); - if (attenuate) { - ARGBAttenuateRow(dst_argb, dst_argb, width); - } - dst_argb += dst_stride_argb; - src_a += src_stride_a; - src_y += src_stride_y; - src_u += src_stride_u; - src_v += src_stride_v; - } - return 0; -} - -// Convert I444 with Alpha to preattenuated ARGB with matrix. -LIBYUV_API -int I444AlphaToARGBMatrix(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - const uint8_t* src_a, - int src_stride_a, - uint8_t* dst_argb, - int dst_stride_argb, - const struct YuvConstants* yuvconstants, - int width, - int height, - int attenuate) { - int y; - void (*I444AlphaToARGBRow)(const uint8_t* y_buf, const uint8_t* u_buf, - const uint8_t* v_buf, const uint8_t* a_buf, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width) = I444AlphaToARGBRow_C; - void (*ARGBAttenuateRow)(const uint8_t* src_argb, uint8_t* dst_argb, - int width) = ARGBAttenuateRow_C; - if (!src_y || !src_u || !src_v || !dst_argb || width <= 0 || height == 0) { - return -1; - } - // Negative height means invert the image. 
- if (height < 0) { - height = -height; - dst_argb = dst_argb + (height - 1) * dst_stride_argb; - dst_stride_argb = -dst_stride_argb; - } -#if defined(HAS_I444ALPHATOARGBROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - I444AlphaToARGBRow = I444AlphaToARGBRow_Any_SSSE3; - if (IS_ALIGNED(width, 8)) { - I444AlphaToARGBRow = I444AlphaToARGBRow_SSSE3; - } - } -#endif -#if defined(HAS_I444ALPHATOARGBROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - I444AlphaToARGBRow = I444AlphaToARGBRow_Any_AVX2; - if (IS_ALIGNED(width, 16)) { - I444AlphaToARGBRow = I444AlphaToARGBRow_AVX2; - } - } -#endif -#if defined(HAS_I444ALPHATOARGBROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - I444AlphaToARGBRow = I444AlphaToARGBRow_Any_NEON; - if (IS_ALIGNED(width, 8)) { - I444AlphaToARGBRow = I444AlphaToARGBRow_NEON; - } - } -#endif -#if defined(HAS_I444ALPHATOARGBROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - I444AlphaToARGBRow = I444AlphaToARGBRow_Any_MMI; - if (IS_ALIGNED(width, 4)) { - I444AlphaToARGBRow = I444AlphaToARGBRow_MMI; - } - } -#endif -#if defined(HAS_I444ALPHATOARGBROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - I444AlphaToARGBRow = I444AlphaToARGBRow_Any_MSA; - if (IS_ALIGNED(width, 8)) { - I444AlphaToARGBRow = I444AlphaToARGBRow_MSA; - } - } -#endif -#if defined(HAS_ARGBATTENUATEROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - ARGBAttenuateRow = ARGBAttenuateRow_Any_SSSE3; - if (IS_ALIGNED(width, 4)) { - ARGBAttenuateRow = ARGBAttenuateRow_SSSE3; - } - } -#endif -#if defined(HAS_ARGBATTENUATEROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - ARGBAttenuateRow = ARGBAttenuateRow_Any_AVX2; - if (IS_ALIGNED(width, 8)) { - ARGBAttenuateRow = ARGBAttenuateRow_AVX2; - } - } -#endif -#if defined(HAS_ARGBATTENUATEROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - ARGBAttenuateRow = ARGBAttenuateRow_Any_NEON; - if (IS_ALIGNED(width, 8)) { - ARGBAttenuateRow = ARGBAttenuateRow_NEON; - } - } -#endif -#if defined(HAS_ARGBATTENUATEROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - ARGBAttenuateRow = ARGBAttenuateRow_Any_MMI; - if (IS_ALIGNED(width, 2)) { - ARGBAttenuateRow = ARGBAttenuateRow_MMI; - } - } -#endif -#if defined(HAS_ARGBATTENUATEROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - ARGBAttenuateRow = ARGBAttenuateRow_Any_MSA; - if (IS_ALIGNED(width, 8)) { - ARGBAttenuateRow = ARGBAttenuateRow_MSA; - } - } -#endif - - for (y = 0; y < height; ++y) { - I444AlphaToARGBRow(src_y, src_u, src_v, src_a, dst_argb, yuvconstants, - width); - if (attenuate) { - ARGBAttenuateRow(dst_argb, dst_argb, width); - } - dst_argb += dst_stride_argb; - src_a += src_stride_a; - src_y += src_stride_y; - src_u += src_stride_u; - src_v += src_stride_v; - } - return 0; -} - -// Convert I420 with Alpha to ARGB. -LIBYUV_API -int I420AlphaToARGB(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - const uint8_t* src_a, - int src_stride_a, - uint8_t* dst_argb, - int dst_stride_argb, - int width, - int height, - int attenuate) { - return I420AlphaToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v, - src_stride_v, src_a, src_stride_a, dst_argb, - dst_stride_argb, &kYuvI601Constants, width, - height, attenuate); -} - -// Convert I420 with Alpha to ABGR. 
-LIBYUV_API -int I420AlphaToABGR(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - const uint8_t* src_a, - int src_stride_a, - uint8_t* dst_abgr, - int dst_stride_abgr, - int width, - int height, - int attenuate) { - return I420AlphaToARGBMatrix( - src_y, src_stride_y, src_v, src_stride_v, // Swap U and V - src_u, src_stride_u, src_a, src_stride_a, dst_abgr, dst_stride_abgr, - &kYvuI601Constants, // Use Yvu matrix - width, height, attenuate); -} - -// Convert I422 with Alpha to ARGB. -LIBYUV_API -int I422AlphaToARGB(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - const uint8_t* src_a, - int src_stride_a, - uint8_t* dst_argb, - int dst_stride_argb, - int width, - int height, - int attenuate) { - return I422AlphaToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v, - src_stride_v, src_a, src_stride_a, dst_argb, - dst_stride_argb, &kYuvI601Constants, width, - height, attenuate); -} - -// Convert I422 with Alpha to ABGR. -LIBYUV_API -int I422AlphaToABGR(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - const uint8_t* src_a, - int src_stride_a, - uint8_t* dst_abgr, - int dst_stride_abgr, - int width, - int height, - int attenuate) { - return I422AlphaToARGBMatrix( - src_y, src_stride_y, src_v, src_stride_v, // Swap U and V - src_u, src_stride_u, src_a, src_stride_a, dst_abgr, dst_stride_abgr, - &kYvuI601Constants, // Use Yvu matrix - width, height, attenuate); -} - -// Convert I444 with Alpha to ARGB. -LIBYUV_API -int I444AlphaToARGB(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - const uint8_t* src_a, - int src_stride_a, - uint8_t* dst_argb, - int dst_stride_argb, - int width, - int height, - int attenuate) { - return I444AlphaToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v, - src_stride_v, src_a, src_stride_a, dst_argb, - dst_stride_argb, &kYuvI601Constants, width, - height, attenuate); -} - -// Convert I444 with Alpha to ABGR. -LIBYUV_API -int I444AlphaToABGR(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - const uint8_t* src_a, - int src_stride_a, - uint8_t* dst_abgr, - int dst_stride_abgr, - int width, - int height, - int attenuate) { - return I444AlphaToARGBMatrix( - src_y, src_stride_y, src_v, src_stride_v, // Swap U and V - src_u, src_stride_u, src_a, src_stride_a, dst_abgr, dst_stride_abgr, - &kYvuI601Constants, // Use Yvu matrix - width, height, attenuate); -} - -// Convert I010 with Alpha to preattenuated ARGB with matrix. 
-LIBYUV_API -int I010AlphaToARGBMatrix(const uint16_t* src_y, - int src_stride_y, - const uint16_t* src_u, - int src_stride_u, - const uint16_t* src_v, - int src_stride_v, - const uint16_t* src_a, - int src_stride_a, - uint8_t* dst_argb, - int dst_stride_argb, - const struct YuvConstants* yuvconstants, - int width, - int height, - int attenuate) { - int y; - void (*I210AlphaToARGBRow)(const uint16_t* y_buf, const uint16_t* u_buf, - const uint16_t* v_buf, const uint16_t* a_buf, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width) = I210AlphaToARGBRow_C; - void (*ARGBAttenuateRow)(const uint8_t* src_argb, uint8_t* dst_argb, - int width) = ARGBAttenuateRow_C; - if (!src_y || !src_u || !src_v || !dst_argb || width <= 0 || height == 0) { - return -1; - } - // Negative height means invert the image. - if (height < 0) { - height = -height; - dst_argb = dst_argb + (height - 1) * dst_stride_argb; - dst_stride_argb = -dst_stride_argb; - } -#if defined(HAS_I210ALPHATOARGBROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - I210AlphaToARGBRow = I210AlphaToARGBRow_Any_SSSE3; - if (IS_ALIGNED(width, 8)) { - I210AlphaToARGBRow = I210AlphaToARGBRow_SSSE3; - } - } -#endif -#if defined(HAS_I210ALPHATOARGBROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - I210AlphaToARGBRow = I210AlphaToARGBRow_Any_AVX2; - if (IS_ALIGNED(width, 16)) { - I210AlphaToARGBRow = I210AlphaToARGBRow_AVX2; - } - } -#endif -#if defined(HAS_ARGBATTENUATEROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - ARGBAttenuateRow = ARGBAttenuateRow_Any_SSSE3; - if (IS_ALIGNED(width, 4)) { - ARGBAttenuateRow = ARGBAttenuateRow_SSSE3; - } - } -#endif -#if defined(HAS_ARGBATTENUATEROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - ARGBAttenuateRow = ARGBAttenuateRow_Any_AVX2; - if (IS_ALIGNED(width, 8)) { - ARGBAttenuateRow = ARGBAttenuateRow_AVX2; - } - } -#endif -#if defined(HAS_ARGBATTENUATEROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - ARGBAttenuateRow = ARGBAttenuateRow_Any_NEON; - if (IS_ALIGNED(width, 8)) { - ARGBAttenuateRow = ARGBAttenuateRow_NEON; - } - } -#endif -#if defined(HAS_ARGBATTENUATEROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - ARGBAttenuateRow = ARGBAttenuateRow_Any_MMI; - if (IS_ALIGNED(width, 2)) { - ARGBAttenuateRow = ARGBAttenuateRow_MMI; - } - } -#endif -#if defined(HAS_ARGBATTENUATEROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - ARGBAttenuateRow = ARGBAttenuateRow_Any_MSA; - if (IS_ALIGNED(width, 8)) { - ARGBAttenuateRow = ARGBAttenuateRow_MSA; - } - } -#endif - - for (y = 0; y < height; ++y) { - I210AlphaToARGBRow(src_y, src_u, src_v, src_a, dst_argb, yuvconstants, - width); - if (attenuate) { - ARGBAttenuateRow(dst_argb, dst_argb, width); - } - dst_argb += dst_stride_argb; - src_a += src_stride_a; - src_y += src_stride_y; - if (y & 1) { - src_u += src_stride_u; - src_v += src_stride_v; - } - } - return 0; -} - -// Convert I210 with Alpha to preattenuated ARGB with matrix. 
-LIBYUV_API -int I210AlphaToARGBMatrix(const uint16_t* src_y, - int src_stride_y, - const uint16_t* src_u, - int src_stride_u, - const uint16_t* src_v, - int src_stride_v, - const uint16_t* src_a, - int src_stride_a, - uint8_t* dst_argb, - int dst_stride_argb, - const struct YuvConstants* yuvconstants, - int width, - int height, - int attenuate) { - int y; - void (*I210AlphaToARGBRow)(const uint16_t* y_buf, const uint16_t* u_buf, - const uint16_t* v_buf, const uint16_t* a_buf, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width) = I210AlphaToARGBRow_C; - void (*ARGBAttenuateRow)(const uint8_t* src_argb, uint8_t* dst_argb, - int width) = ARGBAttenuateRow_C; - if (!src_y || !src_u || !src_v || !dst_argb || width <= 0 || height == 0) { - return -1; - } - // Negative height means invert the image. - if (height < 0) { - height = -height; - dst_argb = dst_argb + (height - 1) * dst_stride_argb; - dst_stride_argb = -dst_stride_argb; - } -#if defined(HAS_I210ALPHATOARGBROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - I210AlphaToARGBRow = I210AlphaToARGBRow_Any_SSSE3; - if (IS_ALIGNED(width, 8)) { - I210AlphaToARGBRow = I210AlphaToARGBRow_SSSE3; - } - } -#endif -#if defined(HAS_I210ALPHATOARGBROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - I210AlphaToARGBRow = I210AlphaToARGBRow_Any_AVX2; - if (IS_ALIGNED(width, 16)) { - I210AlphaToARGBRow = I210AlphaToARGBRow_AVX2; - } - } -#endif -#if defined(HAS_ARGBATTENUATEROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - ARGBAttenuateRow = ARGBAttenuateRow_Any_SSSE3; - if (IS_ALIGNED(width, 4)) { - ARGBAttenuateRow = ARGBAttenuateRow_SSSE3; - } - } -#endif -#if defined(HAS_ARGBATTENUATEROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - ARGBAttenuateRow = ARGBAttenuateRow_Any_AVX2; - if (IS_ALIGNED(width, 8)) { - ARGBAttenuateRow = ARGBAttenuateRow_AVX2; - } - } -#endif -#if defined(HAS_ARGBATTENUATEROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - ARGBAttenuateRow = ARGBAttenuateRow_Any_NEON; - if (IS_ALIGNED(width, 8)) { - ARGBAttenuateRow = ARGBAttenuateRow_NEON; - } - } -#endif -#if defined(HAS_ARGBATTENUATEROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - ARGBAttenuateRow = ARGBAttenuateRow_Any_MMI; - if (IS_ALIGNED(width, 2)) { - ARGBAttenuateRow = ARGBAttenuateRow_MMI; - } - } -#endif -#if defined(HAS_ARGBATTENUATEROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - ARGBAttenuateRow = ARGBAttenuateRow_Any_MSA; - if (IS_ALIGNED(width, 8)) { - ARGBAttenuateRow = ARGBAttenuateRow_MSA; - } - } -#endif - - for (y = 0; y < height; ++y) { - I210AlphaToARGBRow(src_y, src_u, src_v, src_a, dst_argb, yuvconstants, - width); - if (attenuate) { - ARGBAttenuateRow(dst_argb, dst_argb, width); - } - dst_argb += dst_stride_argb; - src_a += src_stride_a; - src_y += src_stride_y; - src_u += src_stride_u; - src_v += src_stride_v; - } - return 0; -} - -// Convert I410 with Alpha to preattenuated ARGB with matrix. 
-LIBYUV_API -int I410AlphaToARGBMatrix(const uint16_t* src_y, - int src_stride_y, - const uint16_t* src_u, - int src_stride_u, - const uint16_t* src_v, - int src_stride_v, - const uint16_t* src_a, - int src_stride_a, - uint8_t* dst_argb, - int dst_stride_argb, - const struct YuvConstants* yuvconstants, - int width, - int height, - int attenuate) { - int y; - void (*I410AlphaToARGBRow)(const uint16_t* y_buf, const uint16_t* u_buf, - const uint16_t* v_buf, const uint16_t* a_buf, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width) = I410AlphaToARGBRow_C; - void (*ARGBAttenuateRow)(const uint8_t* src_argb, uint8_t* dst_argb, - int width) = ARGBAttenuateRow_C; - if (!src_y || !src_u || !src_v || !dst_argb || width <= 0 || height == 0) { - return -1; - } - // Negative height means invert the image. - if (height < 0) { - height = -height; - dst_argb = dst_argb + (height - 1) * dst_stride_argb; - dst_stride_argb = -dst_stride_argb; - } -#if defined(HAS_I410ALPHATOARGBROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - I410AlphaToARGBRow = I410AlphaToARGBRow_Any_SSSE3; - if (IS_ALIGNED(width, 8)) { - I410AlphaToARGBRow = I410AlphaToARGBRow_SSSE3; - } - } -#endif -#if defined(HAS_I410ALPHATOARGBROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - I410AlphaToARGBRow = I410AlphaToARGBRow_Any_AVX2; - if (IS_ALIGNED(width, 16)) { - I410AlphaToARGBRow = I410AlphaToARGBRow_AVX2; - } - } -#endif -#if defined(HAS_ARGBATTENUATEROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - ARGBAttenuateRow = ARGBAttenuateRow_Any_SSSE3; - if (IS_ALIGNED(width, 4)) { - ARGBAttenuateRow = ARGBAttenuateRow_SSSE3; - } - } -#endif -#if defined(HAS_ARGBATTENUATEROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - ARGBAttenuateRow = ARGBAttenuateRow_Any_AVX2; - if (IS_ALIGNED(width, 8)) { - ARGBAttenuateRow = ARGBAttenuateRow_AVX2; - } - } -#endif -#if defined(HAS_ARGBATTENUATEROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - ARGBAttenuateRow = ARGBAttenuateRow_Any_NEON; - if (IS_ALIGNED(width, 8)) { - ARGBAttenuateRow = ARGBAttenuateRow_NEON; - } - } -#endif -#if defined(HAS_ARGBATTENUATEROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - ARGBAttenuateRow = ARGBAttenuateRow_Any_MMI; - if (IS_ALIGNED(width, 2)) { - ARGBAttenuateRow = ARGBAttenuateRow_MMI; - } - } -#endif -#if defined(HAS_ARGBATTENUATEROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - ARGBAttenuateRow = ARGBAttenuateRow_Any_MSA; - if (IS_ALIGNED(width, 8)) { - ARGBAttenuateRow = ARGBAttenuateRow_MSA; - } - } -#endif - - for (y = 0; y < height; ++y) { - I410AlphaToARGBRow(src_y, src_u, src_v, src_a, dst_argb, yuvconstants, - width); - if (attenuate) { - ARGBAttenuateRow(dst_argb, dst_argb, width); - } - dst_argb += dst_stride_argb; - src_a += src_stride_a; - src_y += src_stride_y; - src_u += src_stride_u; - src_v += src_stride_v; - } - return 0; -} - -// Convert I400 to ARGB with matrix. -LIBYUV_API -int I400ToARGBMatrix(const uint8_t* src_y, - int src_stride_y, - uint8_t* dst_argb, - int dst_stride_argb, - const struct YuvConstants* yuvconstants, - int width, - int height) { - int y; - void (*I400ToARGBRow)(const uint8_t* y_buf, uint8_t* rgb_buf, - const struct YuvConstants* yuvconstants, int width) = - I400ToARGBRow_C; - if (!src_y || !dst_argb || width <= 0 || height == 0) { - return -1; - } - // Negative height means invert the image. - if (height < 0) { - height = -height; - dst_argb = dst_argb + (height - 1) * dst_stride_argb; - dst_stride_argb = -dst_stride_argb; - } - // Coalesce rows. 
- if (src_stride_y == width && dst_stride_argb == width * 4) { - width *= height; - height = 1; - src_stride_y = dst_stride_argb = 0; - } -#if defined(HAS_I400TOARGBROW_SSE2) - if (TestCpuFlag(kCpuHasSSE2)) { - I400ToARGBRow = I400ToARGBRow_Any_SSE2; - if (IS_ALIGNED(width, 8)) { - I400ToARGBRow = I400ToARGBRow_SSE2; - } - } -#endif -#if defined(HAS_I400TOARGBROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - I400ToARGBRow = I400ToARGBRow_Any_AVX2; - if (IS_ALIGNED(width, 16)) { - I400ToARGBRow = I400ToARGBRow_AVX2; - } - } -#endif -#if defined(HAS_I400TOARGBROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - I400ToARGBRow = I400ToARGBRow_Any_NEON; - if (IS_ALIGNED(width, 8)) { - I400ToARGBRow = I400ToARGBRow_NEON; - } - } -#endif -#if defined(HAS_I400TOARGBROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - I400ToARGBRow = I400ToARGBRow_Any_MMI; - if (IS_ALIGNED(width, 8)) { - I400ToARGBRow = I400ToARGBRow_MMI; - } - } -#endif -#if defined(HAS_I400TOARGBROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - I400ToARGBRow = I400ToARGBRow_Any_MSA; - if (IS_ALIGNED(width, 16)) { - I400ToARGBRow = I400ToARGBRow_MSA; - } - } -#endif - - for (y = 0; y < height; ++y) { - I400ToARGBRow(src_y, dst_argb, yuvconstants, width); - dst_argb += dst_stride_argb; - src_y += src_stride_y; - } - return 0; -} - -// Convert I400 to ARGB. -LIBYUV_API -int I400ToARGB(const uint8_t* src_y, - int src_stride_y, - uint8_t* dst_argb, - int dst_stride_argb, - int width, - int height) { - return I400ToARGBMatrix(src_y, src_stride_y, dst_argb, dst_stride_argb, - &kYuvI601Constants, width, height); -} - -// Convert J400 to ARGB. -LIBYUV_API -int J400ToARGB(const uint8_t* src_y, - int src_stride_y, - uint8_t* dst_argb, - int dst_stride_argb, - int width, - int height) { - int y; - void (*J400ToARGBRow)(const uint8_t* src_y, uint8_t* dst_argb, int width) = - J400ToARGBRow_C; - if (!src_y || !dst_argb || width <= 0 || height == 0) { - return -1; - } - // Negative height means invert the image. - if (height < 0) { - height = -height; - src_y = src_y + (height - 1) * src_stride_y; - src_stride_y = -src_stride_y; - } - // Coalesce rows. - if (src_stride_y == width && dst_stride_argb == width * 4) { - width *= height; - height = 1; - src_stride_y = dst_stride_argb = 0; - } -#if defined(HAS_J400TOARGBROW_SSE2) - if (TestCpuFlag(kCpuHasSSE2)) { - J400ToARGBRow = J400ToARGBRow_Any_SSE2; - if (IS_ALIGNED(width, 8)) { - J400ToARGBRow = J400ToARGBRow_SSE2; - } - } -#endif -#if defined(HAS_J400TOARGBROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - J400ToARGBRow = J400ToARGBRow_Any_AVX2; - if (IS_ALIGNED(width, 16)) { - J400ToARGBRow = J400ToARGBRow_AVX2; - } - } -#endif -#if defined(HAS_J400TOARGBROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - J400ToARGBRow = J400ToARGBRow_Any_NEON; - if (IS_ALIGNED(width, 8)) { - J400ToARGBRow = J400ToARGBRow_NEON; - } - } -#endif -#if defined(HAS_J400TOARGBROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - J400ToARGBRow = J400ToARGBRow_Any_MMI; - if (IS_ALIGNED(width, 4)) { - J400ToARGBRow = J400ToARGBRow_MMI; - } - } -#endif -#if defined(HAS_J400TOARGBROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - J400ToARGBRow = J400ToARGBRow_Any_MSA; - if (IS_ALIGNED(width, 16)) { - J400ToARGBRow = J400ToARGBRow_MSA; - } - } -#endif - for (y = 0; y < height; ++y) { - J400ToARGBRow(src_y, dst_argb, width); - src_y += src_stride_y; - dst_argb += dst_stride_argb; - } - return 0; -} - -// Shuffle table for converting BGRA to ARGB. 
-static const uvec8 kShuffleMaskBGRAToARGB = { - 3u, 2u, 1u, 0u, 7u, 6u, 5u, 4u, 11u, 10u, 9u, 8u, 15u, 14u, 13u, 12u}; - -// Shuffle table for converting ABGR to ARGB. -static const uvec8 kShuffleMaskABGRToARGB = { - 2u, 1u, 0u, 3u, 6u, 5u, 4u, 7u, 10u, 9u, 8u, 11u, 14u, 13u, 12u, 15u}; - -// Shuffle table for converting RGBA to ARGB. -static const uvec8 kShuffleMaskRGBAToARGB = { - 1u, 2u, 3u, 0u, 5u, 6u, 7u, 4u, 9u, 10u, 11u, 8u, 13u, 14u, 15u, 12u}; - -// Shuffle table for converting AR64 to AB64. -static const uvec8 kShuffleMaskAR64ToAB64 = { - 4u, 5u, 2u, 3u, 0u, 1u, 6u, 7u, 12u, 13u, 10u, 11u, 8u, 9u, 14u, 15u}; - -// Convert BGRA to ARGB. -LIBYUV_API -int BGRAToARGB(const uint8_t* src_bgra, - int src_stride_bgra, - uint8_t* dst_argb, - int dst_stride_argb, - int width, - int height) { - return ARGBShuffle(src_bgra, src_stride_bgra, dst_argb, dst_stride_argb, - (const uint8_t*)&kShuffleMaskBGRAToARGB, width, height); -} - -// Convert ARGB to BGRA (same as BGRAToARGB). -LIBYUV_API -int ARGBToBGRA(const uint8_t* src_bgra, - int src_stride_bgra, - uint8_t* dst_argb, - int dst_stride_argb, - int width, - int height) { - return ARGBShuffle(src_bgra, src_stride_bgra, dst_argb, dst_stride_argb, - (const uint8_t*)&kShuffleMaskBGRAToARGB, width, height); -} - -// Convert ABGR to ARGB. -LIBYUV_API -int ABGRToARGB(const uint8_t* src_abgr, - int src_stride_abgr, - uint8_t* dst_argb, - int dst_stride_argb, - int width, - int height) { - return ARGBShuffle(src_abgr, src_stride_abgr, dst_argb, dst_stride_argb, - (const uint8_t*)&kShuffleMaskABGRToARGB, width, height); -} - -// Convert ARGB to ABGR to (same as ABGRToARGB). -LIBYUV_API -int ARGBToABGR(const uint8_t* src_abgr, - int src_stride_abgr, - uint8_t* dst_argb, - int dst_stride_argb, - int width, - int height) { - return ARGBShuffle(src_abgr, src_stride_abgr, dst_argb, dst_stride_argb, - (const uint8_t*)&kShuffleMaskABGRToARGB, width, height); -} - -// Convert RGBA to ARGB. -LIBYUV_API -int RGBAToARGB(const uint8_t* src_rgba, - int src_stride_rgba, - uint8_t* dst_argb, - int dst_stride_argb, - int width, - int height) { - return ARGBShuffle(src_rgba, src_stride_rgba, dst_argb, dst_stride_argb, - (const uint8_t*)&kShuffleMaskRGBAToARGB, width, height); -} - -// Convert AR64 To AB64. -LIBYUV_API -int AR64ToAB64(const uint16_t* src_ar64, - int src_stride_ar64, - uint16_t* dst_ab64, - int dst_stride_ab64, - int width, - int height) { - return AR64Shuffle(src_ar64, src_stride_ar64, dst_ab64, dst_stride_ab64, - (const uint8_t*)&kShuffleMaskAR64ToAB64, width, height); -} - -// Convert RGB24 to ARGB. -LIBYUV_API -int RGB24ToARGB(const uint8_t* src_rgb24, - int src_stride_rgb24, - uint8_t* dst_argb, - int dst_stride_argb, - int width, - int height) { - int y; - void (*RGB24ToARGBRow)(const uint8_t* src_rgb, uint8_t* dst_argb, int width) = - RGB24ToARGBRow_C; - if (!src_rgb24 || !dst_argb || width <= 0 || height == 0) { - return -1; - } - // Negative height means invert the image. - if (height < 0) { - height = -height; - src_rgb24 = src_rgb24 + (height - 1) * src_stride_rgb24; - src_stride_rgb24 = -src_stride_rgb24; - } - // Coalesce rows. 
- if (src_stride_rgb24 == width * 3 && dst_stride_argb == width * 4) { - width *= height; - height = 1; - src_stride_rgb24 = dst_stride_argb = 0; - } -#if defined(HAS_RGB24TOARGBROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - RGB24ToARGBRow = RGB24ToARGBRow_Any_SSSE3; - if (IS_ALIGNED(width, 16)) { - RGB24ToARGBRow = RGB24ToARGBRow_SSSE3; - } - } -#endif -#if defined(HAS_RGB24TOARGBROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - RGB24ToARGBRow = RGB24ToARGBRow_Any_NEON; - if (IS_ALIGNED(width, 8)) { - RGB24ToARGBRow = RGB24ToARGBRow_NEON; - } - } -#endif -#if defined(HAS_RGB24TOARGBROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - RGB24ToARGBRow = RGB24ToARGBRow_Any_MMI; - if (IS_ALIGNED(width, 4)) { - RGB24ToARGBRow = RGB24ToARGBRow_MMI; - } - } -#endif -#if defined(HAS_RGB24TOARGBROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - RGB24ToARGBRow = RGB24ToARGBRow_Any_MSA; - if (IS_ALIGNED(width, 16)) { - RGB24ToARGBRow = RGB24ToARGBRow_MSA; - } - } -#endif - - for (y = 0; y < height; ++y) { - RGB24ToARGBRow(src_rgb24, dst_argb, width); - src_rgb24 += src_stride_rgb24; - dst_argb += dst_stride_argb; - } - return 0; -} - -// Convert RAW to ARGB. -LIBYUV_API -int RAWToARGB(const uint8_t* src_raw, - int src_stride_raw, - uint8_t* dst_argb, - int dst_stride_argb, - int width, - int height) { - int y; - void (*RAWToARGBRow)(const uint8_t* src_rgb, uint8_t* dst_argb, int width) = - RAWToARGBRow_C; - if (!src_raw || !dst_argb || width <= 0 || height == 0) { - return -1; - } - // Negative height means invert the image. - if (height < 0) { - height = -height; - src_raw = src_raw + (height - 1) * src_stride_raw; - src_stride_raw = -src_stride_raw; - } - // Coalesce rows. - if (src_stride_raw == width * 3 && dst_stride_argb == width * 4) { - width *= height; - height = 1; - src_stride_raw = dst_stride_argb = 0; - } -#if defined(HAS_RAWTOARGBROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - RAWToARGBRow = RAWToARGBRow_Any_SSSE3; - if (IS_ALIGNED(width, 16)) { - RAWToARGBRow = RAWToARGBRow_SSSE3; - } - } -#endif -#if defined(HAS_RAWTOARGBROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - RAWToARGBRow = RAWToARGBRow_Any_NEON; - if (IS_ALIGNED(width, 8)) { - RAWToARGBRow = RAWToARGBRow_NEON; - } - } -#endif -#if defined(HAS_RAWTOARGBROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - RAWToARGBRow = RAWToARGBRow_Any_MMI; - if (IS_ALIGNED(width, 4)) { - RAWToARGBRow = RAWToARGBRow_MMI; - } - } -#endif -#if defined(HAS_RAWTOARGBROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - RAWToARGBRow = RAWToARGBRow_Any_MSA; - if (IS_ALIGNED(width, 16)) { - RAWToARGBRow = RAWToARGBRow_MSA; - } - } -#endif - - for (y = 0; y < height; ++y) { - RAWToARGBRow(src_raw, dst_argb, width); - src_raw += src_stride_raw; - dst_argb += dst_stride_argb; - } - return 0; -} - -// Convert RAW to RGBA. -LIBYUV_API -int RAWToRGBA(const uint8_t* src_raw, - int src_stride_raw, - uint8_t* dst_rgba, - int dst_stride_rgba, - int width, - int height) { - int y; - void (*RAWToRGBARow)(const uint8_t* src_rgb, uint8_t* dst_rgba, int width) = - RAWToRGBARow_C; - if (!src_raw || !dst_rgba || width <= 0 || height == 0) { - return -1; - } - // Negative height means invert the image. - if (height < 0) { - height = -height; - src_raw = src_raw + (height - 1) * src_stride_raw; - src_stride_raw = -src_stride_raw; - } - // Coalesce rows. 
- if (src_stride_raw == width * 3 && dst_stride_rgba == width * 4) { - width *= height; - height = 1; - src_stride_raw = dst_stride_rgba = 0; - } -#if defined(HAS_RAWTORGBAROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - RAWToRGBARow = RAWToRGBARow_Any_SSSE3; - if (IS_ALIGNED(width, 16)) { - RAWToRGBARow = RAWToRGBARow_SSSE3; - } - } -#endif -#if defined(HAS_RAWTORGBAROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - RAWToRGBARow = RAWToRGBARow_Any_NEON; - if (IS_ALIGNED(width, 8)) { - RAWToRGBARow = RAWToRGBARow_NEON; - } - } -#endif - - for (y = 0; y < height; ++y) { - RAWToRGBARow(src_raw, dst_rgba, width); - src_raw += src_stride_raw; - dst_rgba += dst_stride_rgba; - } - return 0; -} - -// Convert RGB565 to ARGB. -LIBYUV_API -int RGB565ToARGB(const uint8_t* src_rgb565, - int src_stride_rgb565, - uint8_t* dst_argb, - int dst_stride_argb, - int width, - int height) { - int y; - void (*RGB565ToARGBRow)(const uint8_t* src_rgb565, uint8_t* dst_argb, - int width) = RGB565ToARGBRow_C; - if (!src_rgb565 || !dst_argb || width <= 0 || height == 0) { - return -1; - } - // Negative height means invert the image. - if (height < 0) { - height = -height; - src_rgb565 = src_rgb565 + (height - 1) * src_stride_rgb565; - src_stride_rgb565 = -src_stride_rgb565; - } - // Coalesce rows. - if (src_stride_rgb565 == width * 2 && dst_stride_argb == width * 4) { - width *= height; - height = 1; - src_stride_rgb565 = dst_stride_argb = 0; - } -#if defined(HAS_RGB565TOARGBROW_SSE2) - if (TestCpuFlag(kCpuHasSSE2)) { - RGB565ToARGBRow = RGB565ToARGBRow_Any_SSE2; - if (IS_ALIGNED(width, 8)) { - RGB565ToARGBRow = RGB565ToARGBRow_SSE2; - } - } -#endif -#if defined(HAS_RGB565TOARGBROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - RGB565ToARGBRow = RGB565ToARGBRow_Any_AVX2; - if (IS_ALIGNED(width, 16)) { - RGB565ToARGBRow = RGB565ToARGBRow_AVX2; - } - } -#endif -#if defined(HAS_RGB565TOARGBROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - RGB565ToARGBRow = RGB565ToARGBRow_Any_NEON; - if (IS_ALIGNED(width, 8)) { - RGB565ToARGBRow = RGB565ToARGBRow_NEON; - } - } -#endif -#if defined(HAS_RGB565TOARGBROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - RGB565ToARGBRow = RGB565ToARGBRow_Any_MMI; - if (IS_ALIGNED(width, 4)) { - RGB565ToARGBRow = RGB565ToARGBRow_MMI; - } - } -#endif -#if defined(HAS_RGB565TOARGBROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - RGB565ToARGBRow = RGB565ToARGBRow_Any_MSA; - if (IS_ALIGNED(width, 16)) { - RGB565ToARGBRow = RGB565ToARGBRow_MSA; - } - } -#endif - - for (y = 0; y < height; ++y) { - RGB565ToARGBRow(src_rgb565, dst_argb, width); - src_rgb565 += src_stride_rgb565; - dst_argb += dst_stride_argb; - } - return 0; -} - -// Convert ARGB1555 to ARGB. -LIBYUV_API -int ARGB1555ToARGB(const uint8_t* src_argb1555, - int src_stride_argb1555, - uint8_t* dst_argb, - int dst_stride_argb, - int width, - int height) { - int y; - void (*ARGB1555ToARGBRow)(const uint8_t* src_argb1555, uint8_t* dst_argb, - int width) = ARGB1555ToARGBRow_C; - if (!src_argb1555 || !dst_argb || width <= 0 || height == 0) { - return -1; - } - // Negative height means invert the image. - if (height < 0) { - height = -height; - src_argb1555 = src_argb1555 + (height - 1) * src_stride_argb1555; - src_stride_argb1555 = -src_stride_argb1555; - } - // Coalesce rows. 
- if (src_stride_argb1555 == width * 2 && dst_stride_argb == width * 4) { - width *= height; - height = 1; - src_stride_argb1555 = dst_stride_argb = 0; - } -#if defined(HAS_ARGB1555TOARGBROW_SSE2) - if (TestCpuFlag(kCpuHasSSE2)) { - ARGB1555ToARGBRow = ARGB1555ToARGBRow_Any_SSE2; - if (IS_ALIGNED(width, 8)) { - ARGB1555ToARGBRow = ARGB1555ToARGBRow_SSE2; - } - } -#endif -#if defined(HAS_ARGB1555TOARGBROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - ARGB1555ToARGBRow = ARGB1555ToARGBRow_Any_AVX2; - if (IS_ALIGNED(width, 16)) { - ARGB1555ToARGBRow = ARGB1555ToARGBRow_AVX2; - } - } -#endif -#if defined(HAS_ARGB1555TOARGBROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - ARGB1555ToARGBRow = ARGB1555ToARGBRow_Any_NEON; - if (IS_ALIGNED(width, 8)) { - ARGB1555ToARGBRow = ARGB1555ToARGBRow_NEON; - } - } -#endif -#if defined(HAS_ARGB1555TOARGBROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - ARGB1555ToARGBRow = ARGB1555ToARGBRow_Any_MMI; - if (IS_ALIGNED(width, 4)) { - ARGB1555ToARGBRow = ARGB1555ToARGBRow_MMI; - } - } -#endif -#if defined(HAS_ARGB1555TOARGBROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - ARGB1555ToARGBRow = ARGB1555ToARGBRow_Any_MSA; - if (IS_ALIGNED(width, 16)) { - ARGB1555ToARGBRow = ARGB1555ToARGBRow_MSA; - } - } -#endif - - for (y = 0; y < height; ++y) { - ARGB1555ToARGBRow(src_argb1555, dst_argb, width); - src_argb1555 += src_stride_argb1555; - dst_argb += dst_stride_argb; - } - return 0; -} - -// Convert ARGB4444 to ARGB. -LIBYUV_API -int ARGB4444ToARGB(const uint8_t* src_argb4444, - int src_stride_argb4444, - uint8_t* dst_argb, - int dst_stride_argb, - int width, - int height) { - int y; - void (*ARGB4444ToARGBRow)(const uint8_t* src_argb4444, uint8_t* dst_argb, - int width) = ARGB4444ToARGBRow_C; - if (!src_argb4444 || !dst_argb || width <= 0 || height == 0) { - return -1; - } - // Negative height means invert the image. - if (height < 0) { - height = -height; - src_argb4444 = src_argb4444 + (height - 1) * src_stride_argb4444; - src_stride_argb4444 = -src_stride_argb4444; - } - // Coalesce rows. - if (src_stride_argb4444 == width * 2 && dst_stride_argb == width * 4) { - width *= height; - height = 1; - src_stride_argb4444 = dst_stride_argb = 0; - } -#if defined(HAS_ARGB4444TOARGBROW_SSE2) - if (TestCpuFlag(kCpuHasSSE2)) { - ARGB4444ToARGBRow = ARGB4444ToARGBRow_Any_SSE2; - if (IS_ALIGNED(width, 8)) { - ARGB4444ToARGBRow = ARGB4444ToARGBRow_SSE2; - } - } -#endif -#if defined(HAS_ARGB4444TOARGBROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - ARGB4444ToARGBRow = ARGB4444ToARGBRow_Any_AVX2; - if (IS_ALIGNED(width, 16)) { - ARGB4444ToARGBRow = ARGB4444ToARGBRow_AVX2; - } - } -#endif -#if defined(HAS_ARGB4444TOARGBROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - ARGB4444ToARGBRow = ARGB4444ToARGBRow_Any_NEON; - if (IS_ALIGNED(width, 8)) { - ARGB4444ToARGBRow = ARGB4444ToARGBRow_NEON; - } - } -#endif -#if defined(HAS_ARGB4444TOARGBROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - ARGB4444ToARGBRow = ARGB4444ToARGBRow_Any_MMI; - if (IS_ALIGNED(width, 4)) { - ARGB4444ToARGBRow = ARGB4444ToARGBRow_MMI; - } - } -#endif -#if defined(HAS_ARGB4444TOARGBROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - ARGB4444ToARGBRow = ARGB4444ToARGBRow_Any_MSA; - if (IS_ALIGNED(width, 16)) { - ARGB4444ToARGBRow = ARGB4444ToARGBRow_MSA; - } - } -#endif - - for (y = 0; y < height; ++y) { - ARGB4444ToARGBRow(src_argb4444, dst_argb, width); - src_argb4444 += src_stride_argb4444; - dst_argb += dst_stride_argb; - } - return 0; -} - -// Convert AR30 to ARGB. 
-LIBYUV_API -int AR30ToARGB(const uint8_t* src_ar30, - int src_stride_ar30, - uint8_t* dst_argb, - int dst_stride_argb, - int width, - int height) { - int y; - if (!src_ar30 || !dst_argb || width <= 0 || height == 0) { - return -1; - } - // Negative height means invert the image. - if (height < 0) { - height = -height; - src_ar30 = src_ar30 + (height - 1) * src_stride_ar30; - src_stride_ar30 = -src_stride_ar30; - } - // Coalesce rows. - if (src_stride_ar30 == width * 4 && dst_stride_argb == width * 4) { - width *= height; - height = 1; - src_stride_ar30 = dst_stride_argb = 0; - } - for (y = 0; y < height; ++y) { - AR30ToARGBRow_C(src_ar30, dst_argb, width); - src_ar30 += src_stride_ar30; - dst_argb += dst_stride_argb; - } - return 0; -} - -// Convert AR30 to ABGR. -LIBYUV_API -int AR30ToABGR(const uint8_t* src_ar30, - int src_stride_ar30, - uint8_t* dst_abgr, - int dst_stride_abgr, - int width, - int height) { - int y; - if (!src_ar30 || !dst_abgr || width <= 0 || height == 0) { - return -1; - } - // Negative height means invert the image. - if (height < 0) { - height = -height; - src_ar30 = src_ar30 + (height - 1) * src_stride_ar30; - src_stride_ar30 = -src_stride_ar30; - } - // Coalesce rows. - if (src_stride_ar30 == width * 4 && dst_stride_abgr == width * 4) { - width *= height; - height = 1; - src_stride_ar30 = dst_stride_abgr = 0; - } - for (y = 0; y < height; ++y) { - AR30ToABGRRow_C(src_ar30, dst_abgr, width); - src_ar30 += src_stride_ar30; - dst_abgr += dst_stride_abgr; - } - return 0; -} - -// Convert AR30 to AB30. -LIBYUV_API -int AR30ToAB30(const uint8_t* src_ar30, - int src_stride_ar30, - uint8_t* dst_ab30, - int dst_stride_ab30, - int width, - int height) { - int y; - if (!src_ar30 || !dst_ab30 || width <= 0 || height == 0) { - return -1; - } - // Negative height means invert the image. - if (height < 0) { - height = -height; - src_ar30 = src_ar30 + (height - 1) * src_stride_ar30; - src_stride_ar30 = -src_stride_ar30; - } - // Coalesce rows. - if (src_stride_ar30 == width * 4 && dst_stride_ab30 == width * 4) { - width *= height; - height = 1; - src_stride_ar30 = dst_stride_ab30 = 0; - } - for (y = 0; y < height; ++y) { - AR30ToAB30Row_C(src_ar30, dst_ab30, width); - src_ar30 += src_stride_ar30; - dst_ab30 += dst_stride_ab30; - } - return 0; -} - -// Convert AR64 to ARGB. -LIBYUV_API -int AR64ToARGB(const uint16_t* src_ar64, - int src_stride_ar64, - uint8_t* dst_argb, - int dst_stride_argb, - int width, - int height) { - int y; - void (*AR64ToARGBRow)(const uint16_t* src_ar64, uint8_t* dst_argb, - int width) = AR64ToARGBRow_C; - if (!src_ar64 || !dst_argb || width <= 0 || height == 0) { - return -1; - } - // Negative height means invert the image. - if (height < 0) { - height = -height; - src_ar64 = src_ar64 + (height - 1) * src_stride_ar64; - src_stride_ar64 = -src_stride_ar64; - } - // Coalesce rows. 
- if (src_stride_ar64 == width * 4 && dst_stride_argb == width * 4) { - width *= height; - height = 1; - src_stride_ar64 = dst_stride_argb = 0; - } -#if defined(HAS_AR64TOARGBROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - AR64ToARGBRow = AR64ToARGBRow_Any_SSSE3; - if (IS_ALIGNED(width, 4)) { - AR64ToARGBRow = AR64ToARGBRow_SSSE3; - } - } -#endif -#if defined(HAS_AR64TOARGBROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - AR64ToARGBRow = AR64ToARGBRow_Any_AVX2; - if (IS_ALIGNED(width, 8)) { - AR64ToARGBRow = AR64ToARGBRow_AVX2; - } - } -#endif -#if defined(HAS_AR64TOARGBROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - AR64ToARGBRow = AR64ToARGBRow_Any_NEON; - if (IS_ALIGNED(width, 8)) { - AR64ToARGBRow = AR64ToARGBRow_NEON; - } - } -#endif - - for (y = 0; y < height; ++y) { - AR64ToARGBRow(src_ar64, dst_argb, width); - src_ar64 += src_stride_ar64; - dst_argb += dst_stride_argb; - } - return 0; -} - -// Convert AB64 to ARGB. -LIBYUV_API -int AB64ToARGB(const uint16_t* src_ab64, - int src_stride_ab64, - uint8_t* dst_argb, - int dst_stride_argb, - int width, - int height) { - int y; - void (*AB64ToARGBRow)(const uint16_t* src_ar64, uint8_t* dst_argb, - int width) = AB64ToARGBRow_C; - if (!src_ab64 || !dst_argb || width <= 0 || height == 0) { - return -1; - } - // Negative height means invert the image. - if (height < 0) { - height = -height; - src_ab64 = src_ab64 + (height - 1) * src_stride_ab64; - src_stride_ab64 = -src_stride_ab64; - } - // Coalesce rows. - if (src_stride_ab64 == width * 4 && dst_stride_argb == width * 4) { - width *= height; - height = 1; - src_stride_ab64 = dst_stride_argb = 0; - } -#if defined(HAS_AB64TOARGBROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - AB64ToARGBRow = AB64ToARGBRow_Any_SSSE3; - if (IS_ALIGNED(width, 4)) { - AB64ToARGBRow = AB64ToARGBRow_SSSE3; - } - } -#endif -#if defined(HAS_AB64TOARGBROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - AB64ToARGBRow = AB64ToARGBRow_Any_AVX2; - if (IS_ALIGNED(width, 8)) { - AB64ToARGBRow = AB64ToARGBRow_AVX2; - } - } -#endif -#if defined(HAS_AB64TOARGBROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - AB64ToARGBRow = AB64ToARGBRow_Any_NEON; - if (IS_ALIGNED(width, 8)) { - AB64ToARGBRow = AB64ToARGBRow_NEON; - } - } -#endif - - for (y = 0; y < height; ++y) { - AB64ToARGBRow(src_ab64, dst_argb, width); - src_ab64 += src_stride_ab64; - dst_argb += dst_stride_argb; - } - return 0; -} - -// Convert NV12 to ARGB with matrix. -LIBYUV_API -int NV12ToARGBMatrix(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_uv, - int src_stride_uv, - uint8_t* dst_argb, - int dst_stride_argb, - const struct YuvConstants* yuvconstants, - int width, - int height) { - int y; - void (*NV12ToARGBRow)( - const uint8_t* y_buf, const uint8_t* uv_buf, uint8_t* rgb_buf, - const struct YuvConstants* yuvconstants, int width) = NV12ToARGBRow_C; - if (!src_y || !src_uv || !dst_argb || width <= 0 || height == 0) { - return -1; - } - // Negative height means invert the image. 
- if (height < 0) { - height = -height; - dst_argb = dst_argb + (height - 1) * dst_stride_argb; - dst_stride_argb = -dst_stride_argb; - } -#if defined(HAS_NV12TOARGBROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - NV12ToARGBRow = NV12ToARGBRow_Any_SSSE3; - if (IS_ALIGNED(width, 8)) { - NV12ToARGBRow = NV12ToARGBRow_SSSE3; - } - } -#endif -#if defined(HAS_NV12TOARGBROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - NV12ToARGBRow = NV12ToARGBRow_Any_AVX2; - if (IS_ALIGNED(width, 16)) { - NV12ToARGBRow = NV12ToARGBRow_AVX2; - } - } -#endif -#if defined(HAS_NV12TOARGBROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - NV12ToARGBRow = NV12ToARGBRow_Any_NEON; - if (IS_ALIGNED(width, 8)) { - NV12ToARGBRow = NV12ToARGBRow_NEON; - } - } -#endif -#if defined(HAS_NV12TOARGBROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - NV12ToARGBRow = NV12ToARGBRow_Any_MMI; - if (IS_ALIGNED(width, 4)) { - NV12ToARGBRow = NV12ToARGBRow_MMI; - } - } -#endif -#if defined(HAS_NV12TOARGBROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - NV12ToARGBRow = NV12ToARGBRow_Any_MSA; - if (IS_ALIGNED(width, 8)) { - NV12ToARGBRow = NV12ToARGBRow_MSA; - } - } -#endif - - for (y = 0; y < height; ++y) { - NV12ToARGBRow(src_y, src_uv, dst_argb, yuvconstants, width); - dst_argb += dst_stride_argb; - src_y += src_stride_y; - if (y & 1) { - src_uv += src_stride_uv; - } - } - return 0; -} - -// Convert NV21 to ARGB with matrix. -LIBYUV_API -int NV21ToARGBMatrix(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_vu, - int src_stride_vu, - uint8_t* dst_argb, - int dst_stride_argb, - const struct YuvConstants* yuvconstants, - int width, - int height) { - int y; - void (*NV21ToARGBRow)( - const uint8_t* y_buf, const uint8_t* uv_buf, uint8_t* rgb_buf, - const struct YuvConstants* yuvconstants, int width) = NV21ToARGBRow_C; - if (!src_y || !src_vu || !dst_argb || width <= 0 || height == 0) { - return -1; - } - // Negative height means invert the image. - if (height < 0) { - height = -height; - dst_argb = dst_argb + (height - 1) * dst_stride_argb; - dst_stride_argb = -dst_stride_argb; - } -#if defined(HAS_NV21TOARGBROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - NV21ToARGBRow = NV21ToARGBRow_Any_SSSE3; - if (IS_ALIGNED(width, 8)) { - NV21ToARGBRow = NV21ToARGBRow_SSSE3; - } - } -#endif -#if defined(HAS_NV21TOARGBROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - NV21ToARGBRow = NV21ToARGBRow_Any_AVX2; - if (IS_ALIGNED(width, 16)) { - NV21ToARGBRow = NV21ToARGBRow_AVX2; - } - } -#endif -#if defined(HAS_NV21TOARGBROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - NV21ToARGBRow = NV21ToARGBRow_Any_NEON; - if (IS_ALIGNED(width, 8)) { - NV21ToARGBRow = NV21ToARGBRow_NEON; - } - } -#endif -#if defined(HAS_NV21TOARGBROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - NV21ToARGBRow = NV21ToARGBRow_Any_MMI; - if (IS_ALIGNED(width, 4)) { - NV21ToARGBRow = NV21ToARGBRow_MMI; - } - } -#endif -#if defined(HAS_NV21TOARGBROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - NV21ToARGBRow = NV21ToARGBRow_Any_MSA; - if (IS_ALIGNED(width, 8)) { - NV21ToARGBRow = NV21ToARGBRow_MSA; - } - } -#endif - - for (y = 0; y < height; ++y) { - NV21ToARGBRow(src_y, src_vu, dst_argb, yuvconstants, width); - dst_argb += dst_stride_argb; - src_y += src_stride_y; - if (y & 1) { - src_vu += src_stride_vu; - } - } - return 0; -} - -// Convert NV12 to ARGB. 
-LIBYUV_API -int NV12ToARGB(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_uv, - int src_stride_uv, - uint8_t* dst_argb, - int dst_stride_argb, - int width, - int height) { - return NV12ToARGBMatrix(src_y, src_stride_y, src_uv, src_stride_uv, dst_argb, - dst_stride_argb, &kYuvI601Constants, width, height); -} - -// Convert NV21 to ARGB. -LIBYUV_API -int NV21ToARGB(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_vu, - int src_stride_vu, - uint8_t* dst_argb, - int dst_stride_argb, - int width, - int height) { - return NV21ToARGBMatrix(src_y, src_stride_y, src_vu, src_stride_vu, dst_argb, - dst_stride_argb, &kYuvI601Constants, width, height); -} - -// Convert NV12 to ABGR. -// To output ABGR instead of ARGB swap the UV and use a mirrored yuv matrix. -// To swap the UV use NV12 instead of NV21. -LIBYUV_API -int NV12ToABGR(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_uv, - int src_stride_uv, - uint8_t* dst_abgr, - int dst_stride_abgr, - int width, - int height) { - return NV21ToARGBMatrix(src_y, src_stride_y, src_uv, src_stride_uv, dst_abgr, - dst_stride_abgr, &kYvuI601Constants, width, height); -} - -// Convert NV21 to ABGR. -LIBYUV_API -int NV21ToABGR(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_vu, - int src_stride_vu, - uint8_t* dst_abgr, - int dst_stride_abgr, - int width, - int height) { - return NV12ToARGBMatrix(src_y, src_stride_y, src_vu, src_stride_vu, dst_abgr, - dst_stride_abgr, &kYvuI601Constants, width, height); -} - -// TODO(fbarchard): Consider SSSE3 2 step conversion. -// Convert NV12 to RGB24 with matrix. -LIBYUV_API -int NV12ToRGB24Matrix(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_uv, - int src_stride_uv, - uint8_t* dst_rgb24, - int dst_stride_rgb24, - const struct YuvConstants* yuvconstants, - int width, - int height) { - int y; - void (*NV12ToRGB24Row)( - const uint8_t* y_buf, const uint8_t* uv_buf, uint8_t* rgb_buf, - const struct YuvConstants* yuvconstants, int width) = NV12ToRGB24Row_C; - if (!src_y || !src_uv || !dst_rgb24 || width <= 0 || height == 0) { - return -1; - } - // Negative height means invert the image. - if (height < 0) { - height = -height; - dst_rgb24 = dst_rgb24 + (height - 1) * dst_stride_rgb24; - dst_stride_rgb24 = -dst_stride_rgb24; - } -#if defined(HAS_NV12TORGB24ROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - NV12ToRGB24Row = NV12ToRGB24Row_Any_NEON; - if (IS_ALIGNED(width, 8)) { - NV12ToRGB24Row = NV12ToRGB24Row_NEON; - } - } -#endif -#if defined(HAS_NV12TORGB24ROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - NV12ToRGB24Row = NV12ToRGB24Row_Any_SSSE3; - if (IS_ALIGNED(width, 16)) { - NV12ToRGB24Row = NV12ToRGB24Row_SSSE3; - } - } -#endif -#if defined(HAS_NV12TORGB24ROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - NV12ToRGB24Row = NV12ToRGB24Row_Any_AVX2; - if (IS_ALIGNED(width, 32)) { - NV12ToRGB24Row = NV12ToRGB24Row_AVX2; - } - } -#endif -#if defined(HAS_NV12TORGB24ROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - NV12ToRGB24Row = NV12ToRGB24Row_Any_MMI; - if (IS_ALIGNED(width, 8)) { - NV12ToRGB24Row = NV12ToRGB24Row_MMI; - } - } -#endif - - for (y = 0; y < height; ++y) { - NV12ToRGB24Row(src_y, src_uv, dst_rgb24, yuvconstants, width); - dst_rgb24 += dst_stride_rgb24; - src_y += src_stride_y; - if (y & 1) { - src_uv += src_stride_uv; - } - } - return 0; -} - -// Convert NV21 to RGB24 with matrix.
-LIBYUV_API -int NV21ToRGB24Matrix(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_vu, - int src_stride_vu, - uint8_t* dst_rgb24, - int dst_stride_rgb24, - const struct YuvConstants* yuvconstants, - int width, - int height) { - int y; - void (*NV21ToRGB24Row)( - const uint8_t* y_buf, const uint8_t* uv_buf, uint8_t* rgb_buf, - const struct YuvConstants* yuvconstants, int width) = NV21ToRGB24Row_C; - if (!src_y || !src_vu || !dst_rgb24 || width <= 0 || height == 0) { - return -1; - } - // Negative height means invert the image. - if (height < 0) { - height = -height; - dst_rgb24 = dst_rgb24 + (height - 1) * dst_stride_rgb24; - dst_stride_rgb24 = -dst_stride_rgb24; - } -#if defined(HAS_NV21TORGB24ROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - NV21ToRGB24Row = NV21ToRGB24Row_Any_NEON; - if (IS_ALIGNED(width, 8)) { - NV21ToRGB24Row = NV21ToRGB24Row_NEON; - } - } -#endif -#if defined(HAS_NV21TORGB24ROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - NV21ToRGB24Row = NV21ToRGB24Row_Any_SSSE3; - if (IS_ALIGNED(width, 16)) { - NV21ToRGB24Row = NV21ToRGB24Row_SSSE3; - } - } -#endif -#if defined(HAS_NV21TORGB24ROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - NV21ToRGB24Row = NV21ToRGB24Row_Any_AVX2; - if (IS_ALIGNED(width, 32)) { - NV21ToRGB24Row = NV21ToRGB24Row_AVX2; - } - } -#endif -#if defined(HAS_NV21TORGB24ROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - NV21ToRGB24Row = NV21ToRGB24Row_Any_MMI; - if (IS_ALIGNED(width, 8)) { - NV21ToRGB24Row = NV21ToRGB24Row_MMI; - } - } -#endif - - for (y = 0; y < height; ++y) { - NV21ToRGB24Row(src_y, src_vu, dst_rgb24, yuvconstants, width); - dst_rgb24 += dst_stride_rgb24; - src_y += src_stride_y; - if (y & 1) { - src_vu += src_stride_vu; - } - } - return 0; -} - -// Convert NV12 to RGB24. -LIBYUV_API -int NV12ToRGB24(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_uv, - int src_stride_uv, - uint8_t* dst_rgb24, - int dst_stride_rgb24, - int width, - int height) { - return NV12ToRGB24Matrix(src_y, src_stride_y, src_uv, src_stride_uv, - dst_rgb24, dst_stride_rgb24, &kYuvI601Constants, - width, height); -} - -// Convert NV21 to RGB24. -LIBYUV_API -int NV21ToRGB24(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_vu, - int src_stride_vu, - uint8_t* dst_rgb24, - int dst_stride_rgb24, - int width, - int height) { - return NV21ToRGB24Matrix(src_y, src_stride_y, src_vu, src_stride_vu, - dst_rgb24, dst_stride_rgb24, &kYuvI601Constants, - width, height); -} - -// Convert NV12 to RAW. -LIBYUV_API -int NV12ToRAW(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_uv, - int src_stride_uv, - uint8_t* dst_raw, - int dst_stride_raw, - int width, - int height) { - return NV21ToRGB24Matrix(src_y, src_stride_y, src_uv, src_stride_uv, dst_raw, - dst_stride_raw, &kYvuI601Constants, width, height); -} - -// Convert NV21 to RAW. 
-LIBYUV_API -int NV21ToRAW(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_vu, - int src_stride_vu, - uint8_t* dst_raw, - int dst_stride_raw, - int width, - int height) { - return NV12ToRGB24Matrix(src_y, src_stride_y, src_vu, src_stride_vu, dst_raw, - dst_stride_raw, &kYvuI601Constants, width, height); -} - -// Convert NV21 to YUV24 -int NV21ToYUV24(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_vu, - int src_stride_vu, - uint8_t* dst_yuv24, - int dst_stride_yuv24, - int width, - int height) { - int y; - void (*NV21ToYUV24Row)(const uint8_t* src_y, const uint8_t* src_vu, - uint8_t* dst_yuv24, int width) = NV21ToYUV24Row_C; - if (!src_y || !src_vu || !dst_yuv24 || width <= 0 || height == 0) { - return -1; - } - // Negative height means invert the image. - if (height < 0) { - height = -height; - dst_yuv24 = dst_yuv24 + (height - 1) * dst_stride_yuv24; - dst_stride_yuv24 = -dst_stride_yuv24; - } -#if defined(HAS_NV21TOYUV24ROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - NV21ToYUV24Row = NV21ToYUV24Row_Any_NEON; - if (IS_ALIGNED(width, 16)) { - NV21ToYUV24Row = NV21ToYUV24Row_NEON; - } - } -#endif -#if defined(HAS_NV21TOYUV24ROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - NV21ToYUV24Row = NV21ToYUV24Row_Any_AVX2; - if (IS_ALIGNED(width, 32)) { - NV21ToYUV24Row = NV21ToYUV24Row_AVX2; - } - } -#endif - for (y = 0; y < height; ++y) { - NV21ToYUV24Row(src_y, src_vu, dst_yuv24, width); - dst_yuv24 += dst_stride_yuv24; - src_y += src_stride_y; - if (y & 1) { - src_vu += src_stride_vu; - } - } - return 0; -} - -// Convert YUY2 to ARGB. -LIBYUV_API -int YUY2ToARGB(const uint8_t* src_yuy2, - int src_stride_yuy2, - uint8_t* dst_argb, - int dst_stride_argb, - int width, - int height) { - int y; - void (*YUY2ToARGBRow)(const uint8_t* src_yuy2, uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, int width) = - YUY2ToARGBRow_C; - if (!src_yuy2 || !dst_argb || width <= 0 || height == 0) { - return -1; - } - // Negative height means invert the image. - if (height < 0) { - height = -height; - src_yuy2 = src_yuy2 + (height - 1) * src_stride_yuy2; - src_stride_yuy2 = -src_stride_yuy2; - } - // Coalesce rows. - if (src_stride_yuy2 == width * 2 && dst_stride_argb == width * 4) { - width *= height; - height = 1; - src_stride_yuy2 = dst_stride_argb = 0; - } -#if defined(HAS_YUY2TOARGBROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - YUY2ToARGBRow = YUY2ToARGBRow_Any_SSSE3; - if (IS_ALIGNED(width, 16)) { - YUY2ToARGBRow = YUY2ToARGBRow_SSSE3; - } - } -#endif -#if defined(HAS_YUY2TOARGBROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - YUY2ToARGBRow = YUY2ToARGBRow_Any_AVX2; - if (IS_ALIGNED(width, 32)) { - YUY2ToARGBRow = YUY2ToARGBRow_AVX2; - } - } -#endif -#if defined(HAS_YUY2TOARGBROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - YUY2ToARGBRow = YUY2ToARGBRow_Any_NEON; - if (IS_ALIGNED(width, 8)) { - YUY2ToARGBRow = YUY2ToARGBRow_NEON; - } - } -#endif -#if defined(HAS_YUY2TOARGBROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - YUY2ToARGBRow = YUY2ToARGBRow_Any_MMI; - if (IS_ALIGNED(width, 4)) { - YUY2ToARGBRow = YUY2ToARGBRow_MMI; - } - } -#endif -#if defined(HAS_YUY2TOARGBROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - YUY2ToARGBRow = YUY2ToARGBRow_Any_MSA; - if (IS_ALIGNED(width, 8)) { - YUY2ToARGBRow = YUY2ToARGBRow_MSA; - } - } -#endif - for (y = 0; y < height; ++y) { - YUY2ToARGBRow(src_yuy2, dst_argb, &kYuvI601Constants, width); - src_yuy2 += src_stride_yuy2; - dst_argb += dst_stride_argb; - } - return 0; -} - -// Convert UYVY to ARGB. 
-LIBYUV_API -int UYVYToARGB(const uint8_t* src_uyvy, - int src_stride_uyvy, - uint8_t* dst_argb, - int dst_stride_argb, - int width, - int height) { - int y; - void (*UYVYToARGBRow)(const uint8_t* src_uyvy, uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, int width) = - UYVYToARGBRow_C; - if (!src_uyvy || !dst_argb || width <= 0 || height == 0) { - return -1; - } - // Negative height means invert the image. - if (height < 0) { - height = -height; - src_uyvy = src_uyvy + (height - 1) * src_stride_uyvy; - src_stride_uyvy = -src_stride_uyvy; - } - // Coalesce rows. - if (src_stride_uyvy == width * 2 && dst_stride_argb == width * 4) { - width *= height; - height = 1; - src_stride_uyvy = dst_stride_argb = 0; - } -#if defined(HAS_UYVYTOARGBROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - UYVYToARGBRow = UYVYToARGBRow_Any_SSSE3; - if (IS_ALIGNED(width, 16)) { - UYVYToARGBRow = UYVYToARGBRow_SSSE3; - } - } -#endif -#if defined(HAS_UYVYTOARGBROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - UYVYToARGBRow = UYVYToARGBRow_Any_AVX2; - if (IS_ALIGNED(width, 32)) { - UYVYToARGBRow = UYVYToARGBRow_AVX2; - } - } -#endif -#if defined(HAS_UYVYTOARGBROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - UYVYToARGBRow = UYVYToARGBRow_Any_NEON; - if (IS_ALIGNED(width, 8)) { - UYVYToARGBRow = UYVYToARGBRow_NEON; - } - } -#endif -#if defined(HAS_UYVYTOARGBROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - UYVYToARGBRow = UYVYToARGBRow_Any_MMI; - if (IS_ALIGNED(width, 4)) { - UYVYToARGBRow = UYVYToARGBRow_MMI; - } - } -#endif -#if defined(HAS_UYVYTOARGBROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - UYVYToARGBRow = UYVYToARGBRow_Any_MSA; - if (IS_ALIGNED(width, 8)) { - UYVYToARGBRow = UYVYToARGBRow_MSA; - } - } -#endif - for (y = 0; y < height; ++y) { - UYVYToARGBRow(src_uyvy, dst_argb, &kYuvI601Constants, width); - src_uyvy += src_stride_uyvy; - dst_argb += dst_stride_argb; - } - return 0; -} -static void WeavePixels(const uint8_t* src_u, - const uint8_t* src_v, - int src_pixel_stride_uv, - uint8_t* dst_uv, - int width) { - int i; - for (i = 0; i < width; ++i) { - dst_uv[0] = *src_u; - dst_uv[1] = *src_v; - dst_uv += 2; - src_u += src_pixel_stride_uv; - src_v += src_pixel_stride_uv; - } -} - -// Convert Android420 to ARGB with matrix. -LIBYUV_API -int Android420ToARGBMatrix(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - int src_pixel_stride_uv, - uint8_t* dst_argb, - int dst_stride_argb, - const struct YuvConstants* yuvconstants, - int width, - int height) { - int y; - uint8_t* dst_uv; - const ptrdiff_t vu_off = src_v - src_u; - int halfwidth = (width + 1) >> 1; - int halfheight = (height + 1) >> 1; - if (!src_y || !src_u || !src_v || !dst_argb || width <= 0 || height == 0) { - return -1; - } - // Negative height means invert the image. 
- if (height < 0) { - height = -height; - halfheight = (height + 1) >> 1; - dst_argb = dst_argb + (height - 1) * dst_stride_argb; - dst_stride_argb = -dst_stride_argb; - } - - // I420 - if (src_pixel_stride_uv == 1) { - return I420ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v, - src_stride_v, dst_argb, dst_stride_argb, - yuvconstants, width, height); - // NV21 - } - if (src_pixel_stride_uv == 2 && vu_off == -1 && - src_stride_u == src_stride_v) { - return NV21ToARGBMatrix(src_y, src_stride_y, src_v, src_stride_v, dst_argb, - dst_stride_argb, yuvconstants, width, height); - // NV12 - } - if (src_pixel_stride_uv == 2 && vu_off == 1 && src_stride_u == src_stride_v) { - return NV12ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, dst_argb, - dst_stride_argb, yuvconstants, width, height); - } - - // General case fallback creates NV12 - align_buffer_64(plane_uv, halfwidth * 2 * halfheight); - dst_uv = plane_uv; - for (y = 0; y < halfheight; ++y) { - WeavePixels(src_u, src_v, src_pixel_stride_uv, dst_uv, halfwidth); - src_u += src_stride_u; - src_v += src_stride_v; - dst_uv += halfwidth * 2; - } - NV12ToARGBMatrix(src_y, src_stride_y, plane_uv, halfwidth * 2, dst_argb, - dst_stride_argb, yuvconstants, width, height); - free_aligned_buffer_64(plane_uv); - return 0; -} - -// Convert Android420 to ARGB. -LIBYUV_API -int Android420ToARGB(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - int src_pixel_stride_uv, - uint8_t* dst_argb, - int dst_stride_argb, - int width, - int height) { - return Android420ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v, - src_stride_v, src_pixel_stride_uv, dst_argb, - dst_stride_argb, &kYuvI601Constants, width, - height); -} - -// Convert Android420 to ABGR. -LIBYUV_API -int Android420ToABGR(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - int src_pixel_stride_uv, - uint8_t* dst_abgr, - int dst_stride_abgr, - int width, - int height) { - return Android420ToARGBMatrix(src_y, src_stride_y, src_v, src_stride_v, src_u, - src_stride_u, src_pixel_stride_uv, dst_abgr, - dst_stride_abgr, &kYvuI601Constants, width, - height); -} - -// Convert I422 to RGBA with matrix. -LIBYUV_API -int I422ToRGBAMatrix(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_rgba, - int dst_stride_rgba, - const struct YuvConstants* yuvconstants, - int width, - int height) { - int y; - void (*I422ToRGBARow)(const uint8_t* y_buf, const uint8_t* u_buf, - const uint8_t* v_buf, uint8_t* rgb_buf, - const struct YuvConstants* yuvconstants, int width) = - I422ToRGBARow_C; - if (!src_y || !src_u || !src_v || !dst_rgba || width <= 0 || height == 0) { - return -1; - } - // Negative height means invert the image. 
- if (height < 0) { - height = -height; - dst_rgba = dst_rgba + (height - 1) * dst_stride_rgba; - dst_stride_rgba = -dst_stride_rgba; - } -#if defined(HAS_I422TORGBAROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - I422ToRGBARow = I422ToRGBARow_Any_SSSE3; - if (IS_ALIGNED(width, 8)) { - I422ToRGBARow = I422ToRGBARow_SSSE3; - } - } -#endif -#if defined(HAS_I422TORGBAROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - I422ToRGBARow = I422ToRGBARow_Any_AVX2; - if (IS_ALIGNED(width, 16)) { - I422ToRGBARow = I422ToRGBARow_AVX2; - } - } -#endif -#if defined(HAS_I422TORGBAROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - I422ToRGBARow = I422ToRGBARow_Any_NEON; - if (IS_ALIGNED(width, 8)) { - I422ToRGBARow = I422ToRGBARow_NEON; - } - } -#endif -#if defined(HAS_I422TORGBAROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - I422ToRGBARow = I422ToRGBARow_Any_MMI; - if (IS_ALIGNED(width, 4)) { - I422ToRGBARow = I422ToRGBARow_MMI; - } - } -#endif -#if defined(HAS_I422TORGBAROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - I422ToRGBARow = I422ToRGBARow_Any_MSA; - if (IS_ALIGNED(width, 8)) { - I422ToRGBARow = I422ToRGBARow_MSA; - } - } -#endif - - for (y = 0; y < height; ++y) { - I422ToRGBARow(src_y, src_u, src_v, dst_rgba, yuvconstants, width); - dst_rgba += dst_stride_rgba; - src_y += src_stride_y; - src_u += src_stride_u; - src_v += src_stride_v; - } - return 0; -} - -// Convert I422 to RGBA. -LIBYUV_API -int I422ToRGBA(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_rgba, - int dst_stride_rgba, - int width, - int height) { - return I422ToRGBAMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v, - src_stride_v, dst_rgba, dst_stride_rgba, - &kYuvI601Constants, width, height); -} - -// Convert I422 to BGRA. -LIBYUV_API -int I422ToBGRA(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_bgra, - int dst_stride_bgra, - int width, - int height) { - return I422ToRGBAMatrix(src_y, src_stride_y, src_v, - src_stride_v, // Swap U and V - src_u, src_stride_u, dst_bgra, dst_stride_bgra, - &kYvuI601Constants, // Use Yvu matrix - width, height); -} - -// Convert NV12 to RGB565 with matrix. -LIBYUV_API -int NV12ToRGB565Matrix(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_uv, - int src_stride_uv, - uint8_t* dst_rgb565, - int dst_stride_rgb565, - const struct YuvConstants* yuvconstants, - int width, - int height) { - int y; - void (*NV12ToRGB565Row)( - const uint8_t* y_buf, const uint8_t* uv_buf, uint8_t* rgb_buf, - const struct YuvConstants* yuvconstants, int width) = NV12ToRGB565Row_C; - if (!src_y || !src_uv || !dst_rgb565 || width <= 0 || height == 0) { - return -1; - } - // Negative height means invert the image. 
- if (height < 0) { - height = -height; - dst_rgb565 = dst_rgb565 + (height - 1) * dst_stride_rgb565; - dst_stride_rgb565 = -dst_stride_rgb565; - } -#if defined(HAS_NV12TORGB565ROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - NV12ToRGB565Row = NV12ToRGB565Row_Any_SSSE3; - if (IS_ALIGNED(width, 8)) { - NV12ToRGB565Row = NV12ToRGB565Row_SSSE3; - } - } -#endif -#if defined(HAS_NV12TORGB565ROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - NV12ToRGB565Row = NV12ToRGB565Row_Any_AVX2; - if (IS_ALIGNED(width, 16)) { - NV12ToRGB565Row = NV12ToRGB565Row_AVX2; - } - } -#endif -#if defined(HAS_NV12TORGB565ROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - NV12ToRGB565Row = NV12ToRGB565Row_Any_NEON; - if (IS_ALIGNED(width, 8)) { - NV12ToRGB565Row = NV12ToRGB565Row_NEON; - } - } -#endif -#if defined(HAS_NV12TORGB565ROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - NV12ToRGB565Row = NV12ToRGB565Row_Any_MMI; - if (IS_ALIGNED(width, 4)) { - NV12ToRGB565Row = NV12ToRGB565Row_MMI; - } - } -#endif -#if defined(HAS_NV12TORGB565ROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - NV12ToRGB565Row = NV12ToRGB565Row_Any_MSA; - if (IS_ALIGNED(width, 8)) { - NV12ToRGB565Row = NV12ToRGB565Row_MSA; - } - } -#endif - - for (y = 0; y < height; ++y) { - NV12ToRGB565Row(src_y, src_uv, dst_rgb565, yuvconstants, width); - dst_rgb565 += dst_stride_rgb565; - src_y += src_stride_y; - if (y & 1) { - src_uv += src_stride_uv; - } - } - return 0; -} - -// Convert NV12 to RGB565. -LIBYUV_API -int NV12ToRGB565(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_uv, - int src_stride_uv, - uint8_t* dst_rgb565, - int dst_stride_rgb565, - int width, - int height) { - return NV12ToRGB565Matrix(src_y, src_stride_y, src_uv, src_stride_uv, - dst_rgb565, dst_stride_rgb565, &kYuvI601Constants, - width, height); -} - -// Convert I422 to RGBA with matrix. -LIBYUV_API -int I420ToRGBAMatrix(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_rgba, - int dst_stride_rgba, - const struct YuvConstants* yuvconstants, - int width, - int height) { - int y; - void (*I422ToRGBARow)(const uint8_t* y_buf, const uint8_t* u_buf, - const uint8_t* v_buf, uint8_t* rgb_buf, - const struct YuvConstants* yuvconstants, int width) = - I422ToRGBARow_C; - if (!src_y || !src_u || !src_v || !dst_rgba || width <= 0 || height == 0) { - return -1; - } - // Negative height means invert the image. 
- if (height < 0) { - height = -height; - dst_rgba = dst_rgba + (height - 1) * dst_stride_rgba; - dst_stride_rgba = -dst_stride_rgba; - } -#if defined(HAS_I422TORGBAROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - I422ToRGBARow = I422ToRGBARow_Any_SSSE3; - if (IS_ALIGNED(width, 8)) { - I422ToRGBARow = I422ToRGBARow_SSSE3; - } - } -#endif -#if defined(HAS_I422TORGBAROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - I422ToRGBARow = I422ToRGBARow_Any_AVX2; - if (IS_ALIGNED(width, 16)) { - I422ToRGBARow = I422ToRGBARow_AVX2; - } - } -#endif -#if defined(HAS_I422TORGBAROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - I422ToRGBARow = I422ToRGBARow_Any_NEON; - if (IS_ALIGNED(width, 8)) { - I422ToRGBARow = I422ToRGBARow_NEON; - } - } -#endif -#if defined(HAS_I422TORGBAROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - I422ToRGBARow = I422ToRGBARow_Any_MMI; - if (IS_ALIGNED(width, 4)) { - I422ToRGBARow = I422ToRGBARow_MMI; - } - } -#endif -#if defined(HAS_I422TORGBAROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - I422ToRGBARow = I422ToRGBARow_Any_MSA; - if (IS_ALIGNED(width, 8)) { - I422ToRGBARow = I422ToRGBARow_MSA; - } - } -#endif - - for (y = 0; y < height; ++y) { - I422ToRGBARow(src_y, src_u, src_v, dst_rgba, yuvconstants, width); - dst_rgba += dst_stride_rgba; - src_y += src_stride_y; - if (y & 1) { - src_u += src_stride_u; - src_v += src_stride_v; - } - } - return 0; -} - -// Convert I420 to RGBA. -LIBYUV_API -int I420ToRGBA(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_rgba, - int dst_stride_rgba, - int width, - int height) { - return I420ToRGBAMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v, - src_stride_v, dst_rgba, dst_stride_rgba, - &kYuvI601Constants, width, height); -} - -// Convert I420 to BGRA. -LIBYUV_API -int I420ToBGRA(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_bgra, - int dst_stride_bgra, - int width, - int height) { - return I420ToRGBAMatrix(src_y, src_stride_y, src_v, - src_stride_v, // Swap U and V - src_u, src_stride_u, dst_bgra, dst_stride_bgra, - &kYvuI601Constants, // Use Yvu matrix - width, height); -} - -// Convert I420 to RGB24 with matrix. -LIBYUV_API -int I420ToRGB24Matrix(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_rgb24, - int dst_stride_rgb24, - const struct YuvConstants* yuvconstants, - int width, - int height) { - int y; - void (*I422ToRGB24Row)(const uint8_t* y_buf, const uint8_t* u_buf, - const uint8_t* v_buf, uint8_t* rgb_buf, - const struct YuvConstants* yuvconstants, int width) = - I422ToRGB24Row_C; - if (!src_y || !src_u || !src_v || !dst_rgb24 || width <= 0 || height == 0) { - return -1; - } - // Negative height means invert the image. 
- if (height < 0) { - height = -height; - dst_rgb24 = dst_rgb24 + (height - 1) * dst_stride_rgb24; - dst_stride_rgb24 = -dst_stride_rgb24; - } -#if defined(HAS_I422TORGB24ROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - I422ToRGB24Row = I422ToRGB24Row_Any_SSSE3; - if (IS_ALIGNED(width, 16)) { - I422ToRGB24Row = I422ToRGB24Row_SSSE3; - } - } -#endif -#if defined(HAS_I422TORGB24ROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - I422ToRGB24Row = I422ToRGB24Row_Any_AVX2; - if (IS_ALIGNED(width, 32)) { - I422ToRGB24Row = I422ToRGB24Row_AVX2; - } - } -#endif -#if defined(HAS_I422TORGB24ROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - I422ToRGB24Row = I422ToRGB24Row_Any_NEON; - if (IS_ALIGNED(width, 8)) { - I422ToRGB24Row = I422ToRGB24Row_NEON; - } - } -#endif -#if defined(HAS_I422TORGB24ROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - I422ToRGB24Row = I422ToRGB24Row_Any_MMI; - if (IS_ALIGNED(width, 4)) { - I422ToRGB24Row = I422ToRGB24Row_MMI; - } - } -#endif -#if defined(HAS_I422TORGB24ROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - I422ToRGB24Row = I422ToRGB24Row_Any_MSA; - if (IS_ALIGNED(width, 16)) { - I422ToRGB24Row = I422ToRGB24Row_MSA; - } - } -#endif - - for (y = 0; y < height; ++y) { - I422ToRGB24Row(src_y, src_u, src_v, dst_rgb24, yuvconstants, width); - dst_rgb24 += dst_stride_rgb24; - src_y += src_stride_y; - if (y & 1) { - src_u += src_stride_u; - src_v += src_stride_v; - } - } - return 0; -} - -// Convert I420 to RGB24. -LIBYUV_API -int I420ToRGB24(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_rgb24, - int dst_stride_rgb24, - int width, - int height) { - return I420ToRGB24Matrix(src_y, src_stride_y, src_u, src_stride_u, src_v, - src_stride_v, dst_rgb24, dst_stride_rgb24, - &kYuvI601Constants, width, height); -} - -// Convert I420 to RAW. -LIBYUV_API -int I420ToRAW(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_raw, - int dst_stride_raw, - int width, - int height) { - return I420ToRGB24Matrix(src_y, src_stride_y, src_v, - src_stride_v, // Swap U and V - src_u, src_stride_u, dst_raw, dst_stride_raw, - &kYvuI601Constants, // Use Yvu matrix - width, height); -} - -// Convert J420 to RGB24. -LIBYUV_API -int J420ToRGB24(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_rgb24, - int dst_stride_rgb24, - int width, - int height) { - return I420ToRGB24Matrix(src_y, src_stride_y, src_u, src_stride_u, src_v, - src_stride_v, dst_rgb24, dst_stride_rgb24, - &kYuvJPEGConstants, width, height); -} - -// Convert J420 to RAW. -LIBYUV_API -int J420ToRAW(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_raw, - int dst_stride_raw, - int width, - int height) { - return I420ToRGB24Matrix(src_y, src_stride_y, src_v, - src_stride_v, // Swap U and V - src_u, src_stride_u, dst_raw, dst_stride_raw, - &kYvuJPEGConstants, // Use Yvu matrix - width, height); -} - -// Convert H420 to RGB24. 
-LIBYUV_API -int H420ToRGB24(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_rgb24, - int dst_stride_rgb24, - int width, - int height) { - return I420ToRGB24Matrix(src_y, src_stride_y, src_u, src_stride_u, src_v, - src_stride_v, dst_rgb24, dst_stride_rgb24, - &kYuvH709Constants, width, height); -} - -// Convert H420 to RAW. -LIBYUV_API -int H420ToRAW(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_raw, - int dst_stride_raw, - int width, - int height) { - return I420ToRGB24Matrix(src_y, src_stride_y, src_v, - src_stride_v, // Swap U and V - src_u, src_stride_u, dst_raw, dst_stride_raw, - &kYvuH709Constants, // Use Yvu matrix - width, height); -} - -// Convert I420 to ARGB1555. -LIBYUV_API -int I420ToARGB1555(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_argb1555, - int dst_stride_argb1555, - int width, - int height) { - int y; - void (*I422ToARGB1555Row)(const uint8_t* y_buf, const uint8_t* u_buf, - const uint8_t* v_buf, uint8_t* rgb_buf, - const struct YuvConstants* yuvconstants, - int width) = I422ToARGB1555Row_C; - if (!src_y || !src_u || !src_v || !dst_argb1555 || width <= 0 || - height == 0) { - return -1; - } - // Negative height means invert the image. - if (height < 0) { - height = -height; - dst_argb1555 = dst_argb1555 + (height - 1) * dst_stride_argb1555; - dst_stride_argb1555 = -dst_stride_argb1555; - } -#if defined(HAS_I422TOARGB1555ROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - I422ToARGB1555Row = I422ToARGB1555Row_Any_SSSE3; - if (IS_ALIGNED(width, 8)) { - I422ToARGB1555Row = I422ToARGB1555Row_SSSE3; - } - } -#endif -#if defined(HAS_I422TOARGB1555ROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - I422ToARGB1555Row = I422ToARGB1555Row_Any_AVX2; - if (IS_ALIGNED(width, 16)) { - I422ToARGB1555Row = I422ToARGB1555Row_AVX2; - } - } -#endif -#if defined(HAS_I422TOARGB1555ROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - I422ToARGB1555Row = I422ToARGB1555Row_Any_NEON; - if (IS_ALIGNED(width, 8)) { - I422ToARGB1555Row = I422ToARGB1555Row_NEON; - } - } -#endif -#if defined(HAS_I422TOARGB1555ROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - I422ToARGB1555Row = I422ToARGB1555Row_Any_MMI; - if (IS_ALIGNED(width, 4)) { - I422ToARGB1555Row = I422ToARGB1555Row_MMI; - } - } -#endif -#if defined(HAS_I422TOARGB1555ROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - I422ToARGB1555Row = I422ToARGB1555Row_Any_MSA; - if (IS_ALIGNED(width, 8)) { - I422ToARGB1555Row = I422ToARGB1555Row_MSA; - } - } -#endif - - for (y = 0; y < height; ++y) { - I422ToARGB1555Row(src_y, src_u, src_v, dst_argb1555, &kYuvI601Constants, - width); - dst_argb1555 += dst_stride_argb1555; - src_y += src_stride_y; - if (y & 1) { - src_u += src_stride_u; - src_v += src_stride_v; - } - } - return 0; -} - -// Convert I420 to ARGB4444. 
-LIBYUV_API -int I420ToARGB4444(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_argb4444, - int dst_stride_argb4444, - int width, - int height) { - int y; - void (*I422ToARGB4444Row)(const uint8_t* y_buf, const uint8_t* u_buf, - const uint8_t* v_buf, uint8_t* rgb_buf, - const struct YuvConstants* yuvconstants, - int width) = I422ToARGB4444Row_C; - if (!src_y || !src_u || !src_v || !dst_argb4444 || width <= 0 || - height == 0) { - return -1; - } - // Negative height means invert the image. - if (height < 0) { - height = -height; - dst_argb4444 = dst_argb4444 + (height - 1) * dst_stride_argb4444; - dst_stride_argb4444 = -dst_stride_argb4444; - } -#if defined(HAS_I422TOARGB4444ROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - I422ToARGB4444Row = I422ToARGB4444Row_Any_SSSE3; - if (IS_ALIGNED(width, 8)) { - I422ToARGB4444Row = I422ToARGB4444Row_SSSE3; - } - } -#endif -#if defined(HAS_I422TOARGB4444ROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - I422ToARGB4444Row = I422ToARGB4444Row_Any_AVX2; - if (IS_ALIGNED(width, 16)) { - I422ToARGB4444Row = I422ToARGB4444Row_AVX2; - } - } -#endif -#if defined(HAS_I422TOARGB4444ROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - I422ToARGB4444Row = I422ToARGB4444Row_Any_NEON; - if (IS_ALIGNED(width, 8)) { - I422ToARGB4444Row = I422ToARGB4444Row_NEON; - } - } -#endif -#if defined(HAS_I422TOARGB4444ROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - I422ToARGB4444Row = I422ToARGB4444Row_Any_MMI; - if (IS_ALIGNED(width, 4)) { - I422ToARGB4444Row = I422ToARGB4444Row_MMI; - } - } -#endif -#if defined(HAS_I422TOARGB4444ROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - I422ToARGB4444Row = I422ToARGB4444Row_Any_MSA; - if (IS_ALIGNED(width, 8)) { - I422ToARGB4444Row = I422ToARGB4444Row_MSA; - } - } -#endif - - for (y = 0; y < height; ++y) { - I422ToARGB4444Row(src_y, src_u, src_v, dst_argb4444, &kYuvI601Constants, - width); - dst_argb4444 += dst_stride_argb4444; - src_y += src_stride_y; - if (y & 1) { - src_u += src_stride_u; - src_v += src_stride_v; - } - } - return 0; -} - -// Convert I420 to RGB565 with specified color matrix. -LIBYUV_API -int I420ToRGB565Matrix(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_rgb565, - int dst_stride_rgb565, - const struct YuvConstants* yuvconstants, - int width, - int height) { - int y; - void (*I422ToRGB565Row)(const uint8_t* y_buf, const uint8_t* u_buf, - const uint8_t* v_buf, uint8_t* rgb_buf, - const struct YuvConstants* yuvconstants, int width) = - I422ToRGB565Row_C; - if (!src_y || !src_u || !src_v || !dst_rgb565 || width <= 0 || height == 0) { - return -1; - } - // Negative height means invert the image. 
- if (height < 0) { - height = -height; - dst_rgb565 = dst_rgb565 + (height - 1) * dst_stride_rgb565; - dst_stride_rgb565 = -dst_stride_rgb565; - } -#if defined(HAS_I422TORGB565ROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - I422ToRGB565Row = I422ToRGB565Row_Any_SSSE3; - if (IS_ALIGNED(width, 8)) { - I422ToRGB565Row = I422ToRGB565Row_SSSE3; - } - } -#endif -#if defined(HAS_I422TORGB565ROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - I422ToRGB565Row = I422ToRGB565Row_Any_AVX2; - if (IS_ALIGNED(width, 16)) { - I422ToRGB565Row = I422ToRGB565Row_AVX2; - } - } -#endif -#if defined(HAS_I422TORGB565ROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - I422ToRGB565Row = I422ToRGB565Row_Any_NEON; - if (IS_ALIGNED(width, 8)) { - I422ToRGB565Row = I422ToRGB565Row_NEON; - } - } -#endif -#if defined(HAS_I422TORGB565ROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - I422ToRGB565Row = I422ToRGB565Row_Any_MMI; - if (IS_ALIGNED(width, 4)) { - I422ToRGB565Row = I422ToRGB565Row_MMI; - } - } -#endif -#if defined(HAS_I422TORGB565ROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - I422ToRGB565Row = I422ToRGB565Row_Any_MSA; - if (IS_ALIGNED(width, 8)) { - I422ToRGB565Row = I422ToRGB565Row_MSA; - } - } -#endif - - for (y = 0; y < height; ++y) { - I422ToRGB565Row(src_y, src_u, src_v, dst_rgb565, yuvconstants, width); - dst_rgb565 += dst_stride_rgb565; - src_y += src_stride_y; - if (y & 1) { - src_u += src_stride_u; - src_v += src_stride_v; - } - } - return 0; -} - -// Convert I420 to RGB565. -LIBYUV_API -int I420ToRGB565(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_rgb565, - int dst_stride_rgb565, - int width, - int height) { - return I420ToRGB565Matrix(src_y, src_stride_y, src_u, src_stride_u, src_v, - src_stride_v, dst_rgb565, dst_stride_rgb565, - &kYuvI601Constants, width, height); -} - -// Convert J420 to RGB565. -LIBYUV_API -int J420ToRGB565(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_rgb565, - int dst_stride_rgb565, - int width, - int height) { - return I420ToRGB565Matrix(src_y, src_stride_y, src_u, src_stride_u, src_v, - src_stride_v, dst_rgb565, dst_stride_rgb565, - &kYuvJPEGConstants, width, height); -} - -// Convert H420 to RGB565. -LIBYUV_API -int H420ToRGB565(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_rgb565, - int dst_stride_rgb565, - int width, - int height) { - return I420ToRGB565Matrix(src_y, src_stride_y, src_u, src_stride_u, src_v, - src_stride_v, dst_rgb565, dst_stride_rgb565, - &kYuvH709Constants, width, height); -} - -// Convert I422 to RGB565. -LIBYUV_API -int I422ToRGB565(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_rgb565, - int dst_stride_rgb565, - int width, - int height) { - int y; - void (*I422ToRGB565Row)(const uint8_t* y_buf, const uint8_t* u_buf, - const uint8_t* v_buf, uint8_t* rgb_buf, - const struct YuvConstants* yuvconstants, int width) = - I422ToRGB565Row_C; - if (!src_y || !src_u || !src_v || !dst_rgb565 || width <= 0 || height == 0) { - return -1; - } - // Negative height means invert the image. 
- if (height < 0) { - height = -height; - dst_rgb565 = dst_rgb565 + (height - 1) * dst_stride_rgb565; - dst_stride_rgb565 = -dst_stride_rgb565; - } -#if defined(HAS_I422TORGB565ROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - I422ToRGB565Row = I422ToRGB565Row_Any_SSSE3; - if (IS_ALIGNED(width, 8)) { - I422ToRGB565Row = I422ToRGB565Row_SSSE3; - } - } -#endif -#if defined(HAS_I422TORGB565ROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - I422ToRGB565Row = I422ToRGB565Row_Any_AVX2; - if (IS_ALIGNED(width, 16)) { - I422ToRGB565Row = I422ToRGB565Row_AVX2; - } - } -#endif -#if defined(HAS_I422TORGB565ROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - I422ToRGB565Row = I422ToRGB565Row_Any_NEON; - if (IS_ALIGNED(width, 8)) { - I422ToRGB565Row = I422ToRGB565Row_NEON; - } - } -#endif -#if defined(HAS_I422TORGB565ROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - I422ToRGB565Row = I422ToRGB565Row_Any_MSA; - if (IS_ALIGNED(width, 8)) { - I422ToRGB565Row = I422ToRGB565Row_MSA; - } - } -#endif - - for (y = 0; y < height; ++y) { - I422ToRGB565Row(src_y, src_u, src_v, dst_rgb565, &kYuvI601Constants, width); - dst_rgb565 += dst_stride_rgb565; - src_y += src_stride_y; - src_u += src_stride_u; - src_v += src_stride_v; - } - return 0; -} - -// Ordered 8x8 dither for 888 to 565. Values from 0 to 7. -static const uint8_t kDither565_4x4[16] = { - 0, 4, 1, 5, 6, 2, 7, 3, 1, 5, 0, 4, 7, 3, 6, 2, -}; - -// Convert I420 to RGB565 with dithering. -LIBYUV_API -int I420ToRGB565Dither(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_rgb565, - int dst_stride_rgb565, - const uint8_t* dither4x4, - int width, - int height) { - int y; - void (*I422ToARGBRow)(const uint8_t* y_buf, const uint8_t* u_buf, - const uint8_t* v_buf, uint8_t* rgb_buf, - const struct YuvConstants* yuvconstants, int width) = - I422ToARGBRow_C; - void (*ARGBToRGB565DitherRow)(const uint8_t* src_argb, uint8_t* dst_rgb, - const uint32_t dither4, int width) = - ARGBToRGB565DitherRow_C; - if (!src_y || !src_u || !src_v || !dst_rgb565 || width <= 0 || height == 0) { - return -1; - } - // Negative height means invert the image. 
- if (height < 0) { - height = -height; - dst_rgb565 = dst_rgb565 + (height - 1) * dst_stride_rgb565; - dst_stride_rgb565 = -dst_stride_rgb565; - } - if (!dither4x4) { - dither4x4 = kDither565_4x4; - } -#if defined(HAS_I422TOARGBROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - I422ToARGBRow = I422ToARGBRow_Any_SSSE3; - if (IS_ALIGNED(width, 8)) { - I422ToARGBRow = I422ToARGBRow_SSSE3; - } - } -#endif -#if defined(HAS_I422TOARGBROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - I422ToARGBRow = I422ToARGBRow_Any_AVX2; - if (IS_ALIGNED(width, 16)) { - I422ToARGBRow = I422ToARGBRow_AVX2; - } - } -#endif -#if defined(HAS_I422TOARGBROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - I422ToARGBRow = I422ToARGBRow_Any_NEON; - if (IS_ALIGNED(width, 8)) { - I422ToARGBRow = I422ToARGBRow_NEON; - } - } -#endif -#if defined(HAS_I422TOARGBROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - I422ToARGBRow = I422ToARGBRow_Any_MMI; - if (IS_ALIGNED(width, 4)) { - I422ToARGBRow = I422ToARGBRow_MMI; - } - } -#endif -#if defined(HAS_I422TOARGBROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - I422ToARGBRow = I422ToARGBRow_Any_MSA; - if (IS_ALIGNED(width, 8)) { - I422ToARGBRow = I422ToARGBRow_MSA; - } - } -#endif -#if defined(HAS_ARGBTORGB565DITHERROW_SSE2) - if (TestCpuFlag(kCpuHasSSE2)) { - ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_Any_SSE2; - if (IS_ALIGNED(width, 4)) { - ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_SSE2; - } - } -#endif -#if defined(HAS_ARGBTORGB565DITHERROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_Any_AVX2; - if (IS_ALIGNED(width, 8)) { - ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_AVX2; - } - } -#endif -#if defined(HAS_ARGBTORGB565DITHERROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_Any_NEON; - if (IS_ALIGNED(width, 8)) { - ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_NEON; - } - } -#endif -#if defined(HAS_ARGBTORGB565DITHERROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_Any_MMI; - if (IS_ALIGNED(width, 4)) { - ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_MMI; - } - } -#endif -#if defined(HAS_ARGBTORGB565DITHERROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_Any_MSA; - if (IS_ALIGNED(width, 8)) { - ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_MSA; - } - } -#endif - { - // Allocate a row of argb. - align_buffer_64(row_argb, width * 4); - for (y = 0; y < height; ++y) { - I422ToARGBRow(src_y, src_u, src_v, row_argb, &kYuvI601Constants, width); - ARGBToRGB565DitherRow(row_argb, dst_rgb565, - *(const uint32_t*)(dither4x4 + ((y & 3) << 2)), - width); - dst_rgb565 += dst_stride_rgb565; - src_y += src_stride_y; - if (y & 1) { - src_u += src_stride_u; - src_v += src_stride_v; - } - } - free_aligned_buffer_64(row_argb); - } - return 0; -} - -// Convert I420 to AR30 with matrix. -LIBYUV_API -int I420ToAR30Matrix(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_ar30, - int dst_stride_ar30, - const struct YuvConstants* yuvconstants, - int width, - int height) { - int y; - void (*I422ToAR30Row)(const uint8_t* y_buf, const uint8_t* u_buf, - const uint8_t* v_buf, uint8_t* rgb_buf, - const struct YuvConstants* yuvconstants, int width) = - I422ToAR30Row_C; - - if (!src_y || !src_u || !src_v || !dst_ar30 || width <= 0 || height == 0) { - return -1; - } - // Negative height means invert the image. 
- if (height < 0) { - height = -height; - dst_ar30 = dst_ar30 + (height - 1) * dst_stride_ar30; - dst_stride_ar30 = -dst_stride_ar30; - } - -#if defined(HAS_I422TOAR30ROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - I422ToAR30Row = I422ToAR30Row_Any_SSSE3; - if (IS_ALIGNED(width, 8)) { - I422ToAR30Row = I422ToAR30Row_SSSE3; - } - } -#endif -#if defined(HAS_I422TOAR30ROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - I422ToAR30Row = I422ToAR30Row_Any_AVX2; - if (IS_ALIGNED(width, 16)) { - I422ToAR30Row = I422ToAR30Row_AVX2; - } - } -#endif - - for (y = 0; y < height; ++y) { - I422ToAR30Row(src_y, src_u, src_v, dst_ar30, yuvconstants, width); - dst_ar30 += dst_stride_ar30; - src_y += src_stride_y; - if (y & 1) { - src_u += src_stride_u; - src_v += src_stride_v; - } - } - return 0; -} - -// Convert I420 to AR30. -LIBYUV_API -int I420ToAR30(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_ar30, - int dst_stride_ar30, - int width, - int height) { - return I420ToAR30Matrix(src_y, src_stride_y, src_u, src_stride_u, src_v, - src_stride_v, dst_ar30, dst_stride_ar30, - &kYuvI601Constants, width, height); -} - -// Convert H420 to AR30. -LIBYUV_API -int H420ToAR30(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_ar30, - int dst_stride_ar30, - int width, - int height) { - return I420ToAR30Matrix(src_y, src_stride_y, src_u, src_stride_u, src_v, - src_stride_v, dst_ar30, dst_stride_ar30, - &kYvuH709Constants, width, height); -} - -// Convert I420 to AB30. -LIBYUV_API -int I420ToAB30(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_ab30, - int dst_stride_ab30, - int width, - int height) { - return I420ToAR30Matrix(src_y, src_stride_y, src_v, src_stride_v, src_u, - src_stride_u, dst_ab30, dst_stride_ab30, - &kYvuI601Constants, width, height); -} - -// Convert H420 to AB30. -LIBYUV_API -int H420ToAB30(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_ab30, - int dst_stride_ab30, - int width, - int height) { - return I420ToAR30Matrix(src_y, src_stride_y, src_v, src_stride_v, src_u, - src_stride_u, dst_ab30, dst_stride_ab30, - &kYvuH709Constants, width, height); -} - -#ifdef __cplusplus -} // extern "C" -} // namespace libyuv -#endif diff --git a/thirdparty/libyuv/source/convert_from.cc b/thirdparty/libyuv/source/convert_from.cc deleted file mode 100644 index 687f0a7..0000000 --- a/thirdparty/libyuv/source/convert_from.cc +++ /dev/null @@ -1,855 +0,0 @@ -/* - * Copyright 2012 The LibYuv Project Authors. All rights reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. 
- */ - -#include "libyuv/convert_from.h" - -#include "libyuv/basic_types.h" -#include "libyuv/convert.h" // For I420Copy -#include "libyuv/cpu_id.h" -#include "libyuv/planar_functions.h" -#include "libyuv/rotate.h" -#include "libyuv/row.h" -#include "libyuv/scale.h" // For ScalePlane() -#include "libyuv/video_common.h" - -#ifdef __cplusplus -namespace libyuv { -extern "C" { -#endif - -#define SUBSAMPLE(v, a, s) (v < 0) ? (-((-v + a) >> s)) : ((v + a) >> s) -static __inline int Abs(int v) { - return v >= 0 ? v : -v; -} - -// I420 To any I4xx YUV format with mirroring. -// TODO(fbarchard): Consider kFilterNone for Y, or CopyPlane - -static int I420ToI4xx(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_y, - int dst_stride_y, - uint8_t* dst_u, - int dst_stride_u, - uint8_t* dst_v, - int dst_stride_v, - int src_y_width, - int src_y_height, - int dst_uv_width, - int dst_uv_height) { - const int dst_y_width = Abs(src_y_width); - const int dst_y_height = Abs(src_y_height); - const int src_uv_width = SUBSAMPLE(src_y_width, 1, 1); - const int src_uv_height = SUBSAMPLE(src_y_height, 1, 1); - if (src_y_width == 0 || src_y_height == 0 || dst_uv_width <= 0 || - dst_uv_height <= 0) { - return -1; - } - if (dst_y) { - ScalePlane(src_y, src_stride_y, src_y_width, src_y_height, dst_y, - dst_stride_y, dst_y_width, dst_y_height, kFilterBilinear); - } - ScalePlane(src_u, src_stride_u, src_uv_width, src_uv_height, dst_u, - dst_stride_u, dst_uv_width, dst_uv_height, kFilterBilinear); - ScalePlane(src_v, src_stride_v, src_uv_width, src_uv_height, dst_v, - dst_stride_v, dst_uv_width, dst_uv_height, kFilterBilinear); - return 0; -} - -// Convert 8 bit YUV to 10 bit. -LIBYUV_API -int I420ToI010(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint16_t* dst_y, - int dst_stride_y, - uint16_t* dst_u, - int dst_stride_u, - uint16_t* dst_v, - int dst_stride_v, - int width, - int height) { - int halfwidth = (width + 1) >> 1; - int halfheight = (height + 1) >> 1; - if (!src_u || !src_v || !dst_u || !dst_v || width <= 0 || height == 0) { - return -1; - } - // Negative height means invert the image. - if (height < 0) { - height = -height; - halfheight = (height + 1) >> 1; - src_y = src_y + (height - 1) * src_stride_y; - src_u = src_u + (halfheight - 1) * src_stride_u; - src_v = src_v + (halfheight - 1) * src_stride_v; - src_stride_y = -src_stride_y; - src_stride_u = -src_stride_u; - src_stride_v = -src_stride_v; - } - - // Convert Y plane. - Convert8To16Plane(src_y, src_stride_y, dst_y, dst_stride_y, 1024, width, - height); - // Convert UV planes. - Convert8To16Plane(src_u, src_stride_u, dst_u, dst_stride_u, 1024, halfwidth, - halfheight); - Convert8To16Plane(src_v, src_stride_v, dst_v, dst_stride_v, 1024, halfwidth, - halfheight); - return 0; -} - -// Convert 8 bit YUV to 12 bit. -LIBYUV_API -int I420ToI012(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint16_t* dst_y, - int dst_stride_y, - uint16_t* dst_u, - int dst_stride_u, - uint16_t* dst_v, - int dst_stride_v, - int width, - int height) { - int halfwidth = (width + 1) >> 1; - int halfheight = (height + 1) >> 1; - if (!src_u || !src_v || !dst_u || !dst_v || width <= 0 || height == 0) { - return -1; - } - // Negative height means invert the image. 
- if (height < 0) { - height = -height; - halfheight = (height + 1) >> 1; - src_y = src_y + (height - 1) * src_stride_y; - src_u = src_u + (halfheight - 1) * src_stride_u; - src_v = src_v + (halfheight - 1) * src_stride_v; - src_stride_y = -src_stride_y; - src_stride_u = -src_stride_u; - src_stride_v = -src_stride_v; - } - - // Convert Y plane. - Convert8To16Plane(src_y, src_stride_y, dst_y, dst_stride_y, 4096, width, - height); - // Convert UV planes. - Convert8To16Plane(src_u, src_stride_u, dst_u, dst_stride_u, 4096, halfwidth, - halfheight); - Convert8To16Plane(src_v, src_stride_v, dst_v, dst_stride_v, 4096, halfwidth, - halfheight); - return 0; -} - -// 420 chroma is 1/2 width, 1/2 height -// 422 chroma is 1/2 width, 1x height -LIBYUV_API -int I420ToI422(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_y, - int dst_stride_y, - uint8_t* dst_u, - int dst_stride_u, - uint8_t* dst_v, - int dst_stride_v, - int width, - int height) { - const int dst_uv_width = (Abs(width) + 1) >> 1; - const int dst_uv_height = Abs(height); - return I420ToI4xx(src_y, src_stride_y, src_u, src_stride_u, src_v, - src_stride_v, dst_y, dst_stride_y, dst_u, dst_stride_u, - dst_v, dst_stride_v, width, height, dst_uv_width, - dst_uv_height); -} - -// 420 chroma is 1/2 width, 1/2 height -// 444 chroma is 1x width, 1x height -LIBYUV_API -int I420ToI444(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_y, - int dst_stride_y, - uint8_t* dst_u, - int dst_stride_u, - uint8_t* dst_v, - int dst_stride_v, - int width, - int height) { - const int dst_uv_width = Abs(width); - const int dst_uv_height = Abs(height); - return I420ToI4xx(src_y, src_stride_y, src_u, src_stride_u, src_v, - src_stride_v, dst_y, dst_stride_y, dst_u, dst_stride_u, - dst_v, dst_stride_v, width, height, dst_uv_width, - dst_uv_height); -} - -// 420 chroma to 444 chroma, 10/12 bit version -LIBYUV_API -int I010ToI410(const uint16_t* src_y, - int src_stride_y, - const uint16_t* src_u, - int src_stride_u, - const uint16_t* src_v, - int src_stride_v, - uint16_t* dst_y, - int dst_stride_y, - uint16_t* dst_u, - int dst_stride_u, - uint16_t* dst_v, - int dst_stride_v, - int width, - int height) { - if (width == 0 || height == 0) { - return -1; - } - - if (dst_y) { - ScalePlane_12(src_y, src_stride_y, width, height, dst_y, dst_stride_y, - Abs(width), Abs(height), kFilterBilinear); - } - ScalePlane_12(src_u, src_stride_u, SUBSAMPLE(width, 1, 1), - SUBSAMPLE(height, 1, 1), dst_u, dst_stride_u, Abs(width), - Abs(height), kFilterBilinear); - ScalePlane_12(src_v, src_stride_v, SUBSAMPLE(width, 1, 1), - SUBSAMPLE(height, 1, 1), dst_v, dst_stride_v, Abs(width), - Abs(height), kFilterBilinear); - return 0; -} - -// 422 chroma to 444 chroma, 10/12 bit version -LIBYUV_API -int I210ToI410(const uint16_t* src_y, - int src_stride_y, - const uint16_t* src_u, - int src_stride_u, - const uint16_t* src_v, - int src_stride_v, - uint16_t* dst_y, - int dst_stride_y, - uint16_t* dst_u, - int dst_stride_u, - uint16_t* dst_v, - int dst_stride_v, - int width, - int height) { - if (width == 0 || height == 0) { - return -1; - } - - if (dst_y) { - ScalePlane_12(src_y, src_stride_y, width, height, dst_y, dst_stride_y, - Abs(width), Abs(height), kFilterBilinear); - } - ScalePlane_12(src_u, src_stride_u, SUBSAMPLE(width, 1, 1), height, dst_u, - dst_stride_u, Abs(width), Abs(height), kFilterBilinear); - 
ScalePlane_12(src_v, src_stride_v, SUBSAMPLE(width, 1, 1), height, dst_v, - dst_stride_v, Abs(width), Abs(height), kFilterBilinear); - return 0; -} - -// 422 chroma is 1/2 width, 1x height -// 444 chroma is 1x width, 1x height -LIBYUV_API -int I422ToI444(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_y, - int dst_stride_y, - uint8_t* dst_u, - int dst_stride_u, - uint8_t* dst_v, - int dst_stride_v, - int width, - int height) { - if (width == 0 || height == 0) { - return -1; - } - - if (dst_y) { - ScalePlane(src_y, src_stride_y, width, height, dst_y, dst_stride_y, - Abs(width), Abs(height), kFilterBilinear); - } - ScalePlane(src_u, src_stride_u, SUBSAMPLE(width, 1, 1), height, dst_u, - dst_stride_u, Abs(width), Abs(height), kFilterBilinear); - ScalePlane(src_v, src_stride_v, SUBSAMPLE(width, 1, 1), height, dst_v, - dst_stride_v, Abs(width), Abs(height), kFilterBilinear); - return 0; -} - -// Copy to I400. Source can be I420,422,444,400,NV12,NV21 -LIBYUV_API -int I400Copy(const uint8_t* src_y, - int src_stride_y, - uint8_t* dst_y, - int dst_stride_y, - int width, - int height) { - if (!src_y || !dst_y || width <= 0 || height == 0) { - return -1; - } - // Negative height means invert the image. - if (height < 0) { - height = -height; - src_y = src_y + (height - 1) * src_stride_y; - src_stride_y = -src_stride_y; - } - CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height); - return 0; -} - -LIBYUV_API -int I422ToYUY2(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_yuy2, - int dst_stride_yuy2, - int width, - int height) { - int y; - void (*I422ToYUY2Row)(const uint8_t* src_y, const uint8_t* src_u, - const uint8_t* src_v, uint8_t* dst_yuy2, int width) = - I422ToYUY2Row_C; - if (!src_y || !src_u || !src_v || !dst_yuy2 || width <= 0 || height == 0) { - return -1; - } - // Negative height means invert the image. - if (height < 0) { - height = -height; - dst_yuy2 = dst_yuy2 + (height - 1) * dst_stride_yuy2; - dst_stride_yuy2 = -dst_stride_yuy2; - } - // Coalesce rows. 
- if (src_stride_y == width && src_stride_u * 2 == width && - src_stride_v * 2 == width && dst_stride_yuy2 == width * 2) { - width *= height; - height = 1; - src_stride_y = src_stride_u = src_stride_v = dst_stride_yuy2 = 0; - } -#if defined(HAS_I422TOYUY2ROW_SSE2) - if (TestCpuFlag(kCpuHasSSE2)) { - I422ToYUY2Row = I422ToYUY2Row_Any_SSE2; - if (IS_ALIGNED(width, 16)) { - I422ToYUY2Row = I422ToYUY2Row_SSE2; - } - } -#endif -#if defined(HAS_I422TOYUY2ROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - I422ToYUY2Row = I422ToYUY2Row_Any_AVX2; - if (IS_ALIGNED(width, 32)) { - I422ToYUY2Row = I422ToYUY2Row_AVX2; - } - } -#endif -#if defined(HAS_I422TOYUY2ROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - I422ToYUY2Row = I422ToYUY2Row_Any_NEON; - if (IS_ALIGNED(width, 16)) { - I422ToYUY2Row = I422ToYUY2Row_NEON; - } - } -#endif - - for (y = 0; y < height; ++y) { - I422ToYUY2Row(src_y, src_u, src_v, dst_yuy2, width); - src_y += src_stride_y; - src_u += src_stride_u; - src_v += src_stride_v; - dst_yuy2 += dst_stride_yuy2; - } - return 0; -} - -LIBYUV_API -int I420ToYUY2(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_yuy2, - int dst_stride_yuy2, - int width, - int height) { - int y; - void (*I422ToYUY2Row)(const uint8_t* src_y, const uint8_t* src_u, - const uint8_t* src_v, uint8_t* dst_yuy2, int width) = - I422ToYUY2Row_C; - if (!src_y || !src_u || !src_v || !dst_yuy2 || width <= 0 || height == 0) { - return -1; - } - // Negative height means invert the image. - if (height < 0) { - height = -height; - dst_yuy2 = dst_yuy2 + (height - 1) * dst_stride_yuy2; - dst_stride_yuy2 = -dst_stride_yuy2; - } -#if defined(HAS_I422TOYUY2ROW_SSE2) - if (TestCpuFlag(kCpuHasSSE2)) { - I422ToYUY2Row = I422ToYUY2Row_Any_SSE2; - if (IS_ALIGNED(width, 16)) { - I422ToYUY2Row = I422ToYUY2Row_SSE2; - } - } -#endif -#if defined(HAS_I422TOYUY2ROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - I422ToYUY2Row = I422ToYUY2Row_Any_AVX2; - if (IS_ALIGNED(width, 32)) { - I422ToYUY2Row = I422ToYUY2Row_AVX2; - } - } -#endif -#if defined(HAS_I422TOYUY2ROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - I422ToYUY2Row = I422ToYUY2Row_Any_NEON; - if (IS_ALIGNED(width, 16)) { - I422ToYUY2Row = I422ToYUY2Row_NEON; - } - } -#endif -#if defined(HAS_I422TOYUY2ROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - I422ToYUY2Row = I422ToYUY2Row_Any_MMI; - if (IS_ALIGNED(width, 8)) { - I422ToYUY2Row = I422ToYUY2Row_MMI; - } - } -#endif -#if defined(HAS_I422TOYUY2ROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - I422ToYUY2Row = I422ToYUY2Row_Any_MSA; - if (IS_ALIGNED(width, 32)) { - I422ToYUY2Row = I422ToYUY2Row_MSA; - } - } -#endif - - for (y = 0; y < height - 1; y += 2) { - I422ToYUY2Row(src_y, src_u, src_v, dst_yuy2, width); - I422ToYUY2Row(src_y + src_stride_y, src_u, src_v, - dst_yuy2 + dst_stride_yuy2, width); - src_y += src_stride_y * 2; - src_u += src_stride_u; - src_v += src_stride_v; - dst_yuy2 += dst_stride_yuy2 * 2; - } - if (height & 1) { - I422ToYUY2Row(src_y, src_u, src_v, dst_yuy2, width); - } - return 0; -} - -LIBYUV_API -int I422ToUYVY(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_uyvy, - int dst_stride_uyvy, - int width, - int height) { - int y; - void (*I422ToUYVYRow)(const uint8_t* src_y, const uint8_t* src_u, - const uint8_t* src_v, uint8_t* dst_uyvy, int width) = - I422ToUYVYRow_C; - if (!src_y || !src_u || !src_v || !dst_uyvy || width <= 0 || height == 0) { - return 
-1; - } - // Negative height means invert the image. - if (height < 0) { - height = -height; - dst_uyvy = dst_uyvy + (height - 1) * dst_stride_uyvy; - dst_stride_uyvy = -dst_stride_uyvy; - } - // Coalesce rows. - if (src_stride_y == width && src_stride_u * 2 == width && - src_stride_v * 2 == width && dst_stride_uyvy == width * 2) { - width *= height; - height = 1; - src_stride_y = src_stride_u = src_stride_v = dst_stride_uyvy = 0; - } -#if defined(HAS_I422TOUYVYROW_SSE2) - if (TestCpuFlag(kCpuHasSSE2)) { - I422ToUYVYRow = I422ToUYVYRow_Any_SSE2; - if (IS_ALIGNED(width, 16)) { - I422ToUYVYRow = I422ToUYVYRow_SSE2; - } - } -#endif -#if defined(HAS_I422TOUYVYROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - I422ToUYVYRow = I422ToUYVYRow_Any_AVX2; - if (IS_ALIGNED(width, 32)) { - I422ToUYVYRow = I422ToUYVYRow_AVX2; - } - } -#endif -#if defined(HAS_I422TOUYVYROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - I422ToUYVYRow = I422ToUYVYRow_Any_NEON; - if (IS_ALIGNED(width, 16)) { - I422ToUYVYRow = I422ToUYVYRow_NEON; - } - } -#endif -#if defined(HAS_I422TOUYVYROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - I422ToUYVYRow = I422ToUYVYRow_Any_MMI; - if (IS_ALIGNED(width, 8)) { - I422ToUYVYRow = I422ToUYVYRow_MMI; - } - } -#endif -#if defined(HAS_I422TOUYVYROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - I422ToUYVYRow = I422ToUYVYRow_Any_MSA; - if (IS_ALIGNED(width, 32)) { - I422ToUYVYRow = I422ToUYVYRow_MSA; - } - } -#endif - - for (y = 0; y < height; ++y) { - I422ToUYVYRow(src_y, src_u, src_v, dst_uyvy, width); - src_y += src_stride_y; - src_u += src_stride_u; - src_v += src_stride_v; - dst_uyvy += dst_stride_uyvy; - } - return 0; -} - -LIBYUV_API -int I420ToUYVY(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_uyvy, - int dst_stride_uyvy, - int width, - int height) { - int y; - void (*I422ToUYVYRow)(const uint8_t* src_y, const uint8_t* src_u, - const uint8_t* src_v, uint8_t* dst_uyvy, int width) = - I422ToUYVYRow_C; - if (!src_y || !src_u || !src_v || !dst_uyvy || width <= 0 || height == 0) { - return -1; - } - // Negative height means invert the image. 
- if (height < 0) { - height = -height; - dst_uyvy = dst_uyvy + (height - 1) * dst_stride_uyvy; - dst_stride_uyvy = -dst_stride_uyvy; - } -#if defined(HAS_I422TOUYVYROW_SSE2) - if (TestCpuFlag(kCpuHasSSE2)) { - I422ToUYVYRow = I422ToUYVYRow_Any_SSE2; - if (IS_ALIGNED(width, 16)) { - I422ToUYVYRow = I422ToUYVYRow_SSE2; - } - } -#endif -#if defined(HAS_I422TOUYVYROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - I422ToUYVYRow = I422ToUYVYRow_Any_AVX2; - if (IS_ALIGNED(width, 32)) { - I422ToUYVYRow = I422ToUYVYRow_AVX2; - } - } -#endif -#if defined(HAS_I422TOUYVYROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - I422ToUYVYRow = I422ToUYVYRow_Any_NEON; - if (IS_ALIGNED(width, 16)) { - I422ToUYVYRow = I422ToUYVYRow_NEON; - } - } -#endif -#if defined(HAS_I422TOUYVYROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - I422ToUYVYRow = I422ToUYVYRow_Any_MMI; - if (IS_ALIGNED(width, 8)) { - I422ToUYVYRow = I422ToUYVYRow_MMI; - } - } -#endif -#if defined(HAS_I422TOUYVYROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - I422ToUYVYRow = I422ToUYVYRow_Any_MSA; - if (IS_ALIGNED(width, 32)) { - I422ToUYVYRow = I422ToUYVYRow_MSA; - } - } -#endif - - for (y = 0; y < height - 1; y += 2) { - I422ToUYVYRow(src_y, src_u, src_v, dst_uyvy, width); - I422ToUYVYRow(src_y + src_stride_y, src_u, src_v, - dst_uyvy + dst_stride_uyvy, width); - src_y += src_stride_y * 2; - src_u += src_stride_u; - src_v += src_stride_v; - dst_uyvy += dst_stride_uyvy * 2; - } - if (height & 1) { - I422ToUYVYRow(src_y, src_u, src_v, dst_uyvy, width); - } - return 0; -} - -LIBYUV_API -int I420ToNV12(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_y, - int dst_stride_y, - uint8_t* dst_uv, - int dst_stride_uv, - int width, - int height) { - int halfwidth = (width + 1) / 2; - int halfheight = (height + 1) / 2; - if (!src_y || !src_u || !src_v || !dst_y || !dst_uv || width <= 0 || - height == 0) { - return -1; - } - // Negative height means invert the image. - if (height < 0) { - height = -height; - halfheight = (height + 1) >> 1; - src_y = src_y + (height - 1) * src_stride_y; - src_u = src_u + (halfheight - 1) * src_stride_u; - src_v = src_v + (halfheight - 1) * src_stride_v; - src_stride_y = -src_stride_y; - src_stride_u = -src_stride_u; - src_stride_v = -src_stride_v; - } - if (dst_y) { - CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height); - } - MergeUVPlane(src_u, src_stride_u, src_v, src_stride_v, dst_uv, dst_stride_uv, - halfwidth, halfheight); - return 0; -} - -LIBYUV_API -int I420ToNV21(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_y, - int dst_stride_y, - uint8_t* dst_vu, - int dst_stride_vu, - int width, - int height) { - return I420ToNV12(src_y, src_stride_y, src_v, src_stride_v, src_u, - src_stride_u, dst_y, dst_stride_y, dst_vu, dst_stride_vu, - width, height); -} - -// Convert I420 to specified format -LIBYUV_API -int ConvertFromI420(const uint8_t* y, - int y_stride, - const uint8_t* u, - int u_stride, - const uint8_t* v, - int v_stride, - uint8_t* dst_sample, - int dst_sample_stride, - int width, - int height, - uint32_t fourcc) { - uint32_t format = CanonicalFourCC(fourcc); - int r = 0; - if (!y || !u || !v || !dst_sample || width <= 0 || height == 0) { - return -1; - } - switch (format) { - // Single plane formats - case FOURCC_YUY2: - r = I420ToYUY2(y, y_stride, u, u_stride, v, v_stride, dst_sample, - dst_sample_stride ? 
dst_sample_stride : width * 2, width, - height); - break; - case FOURCC_UYVY: - r = I420ToUYVY(y, y_stride, u, u_stride, v, v_stride, dst_sample, - dst_sample_stride ? dst_sample_stride : width * 2, width, - height); - break; - case FOURCC_RGBP: - r = I420ToRGB565(y, y_stride, u, u_stride, v, v_stride, dst_sample, - dst_sample_stride ? dst_sample_stride : width * 2, width, - height); - break; - case FOURCC_RGBO: - r = I420ToARGB1555(y, y_stride, u, u_stride, v, v_stride, dst_sample, - dst_sample_stride ? dst_sample_stride : width * 2, - width, height); - break; - case FOURCC_R444: - r = I420ToARGB4444(y, y_stride, u, u_stride, v, v_stride, dst_sample, - dst_sample_stride ? dst_sample_stride : width * 2, - width, height); - break; - case FOURCC_24BG: - r = I420ToRGB24(y, y_stride, u, u_stride, v, v_stride, dst_sample, - dst_sample_stride ? dst_sample_stride : width * 3, width, - height); - break; - case FOURCC_RAW: - r = I420ToRAW(y, y_stride, u, u_stride, v, v_stride, dst_sample, - dst_sample_stride ? dst_sample_stride : width * 3, width, - height); - break; - case FOURCC_ARGB: - r = I420ToARGB(y, y_stride, u, u_stride, v, v_stride, dst_sample, - dst_sample_stride ? dst_sample_stride : width * 4, width, - height); - break; - case FOURCC_BGRA: - r = I420ToBGRA(y, y_stride, u, u_stride, v, v_stride, dst_sample, - dst_sample_stride ? dst_sample_stride : width * 4, width, - height); - break; - case FOURCC_ABGR: - r = I420ToABGR(y, y_stride, u, u_stride, v, v_stride, dst_sample, - dst_sample_stride ? dst_sample_stride : width * 4, width, - height); - break; - case FOURCC_RGBA: - r = I420ToRGBA(y, y_stride, u, u_stride, v, v_stride, dst_sample, - dst_sample_stride ? dst_sample_stride : width * 4, width, - height); - break; - case FOURCC_AR30: - r = I420ToAR30(y, y_stride, u, u_stride, v, v_stride, dst_sample, - dst_sample_stride ? dst_sample_stride : width * 4, width, - height); - break; - case FOURCC_I400: - r = I400Copy(y, y_stride, dst_sample, - dst_sample_stride ? dst_sample_stride : width, width, - height); - break; - case FOURCC_NV12: { - uint8_t* dst_uv = dst_sample + width * height; - r = I420ToNV12(y, y_stride, u, u_stride, v, v_stride, dst_sample, - dst_sample_stride ? dst_sample_stride : width, dst_uv, - dst_sample_stride ? dst_sample_stride : width, width, - height); - break; - } - case FOURCC_NV21: { - uint8_t* dst_vu = dst_sample + width * height; - r = I420ToNV21(y, y_stride, u, u_stride, v, v_stride, dst_sample, - dst_sample_stride ? dst_sample_stride : width, dst_vu, - dst_sample_stride ? dst_sample_stride : width, width, - height); - break; - } - // Triplanar formats - case FOURCC_I420: - case FOURCC_YV12: { - dst_sample_stride = dst_sample_stride ? dst_sample_stride : width; - int halfstride = (dst_sample_stride + 1) / 2; - int halfheight = (height + 1) / 2; - uint8_t* dst_u; - uint8_t* dst_v; - if (format == FOURCC_YV12) { - dst_v = dst_sample + dst_sample_stride * height; - dst_u = dst_v + halfstride * halfheight; - } else { - dst_u = dst_sample + dst_sample_stride * height; - dst_v = dst_u + halfstride * halfheight; - } - r = I420Copy(y, y_stride, u, u_stride, v, v_stride, dst_sample, - dst_sample_stride, dst_u, halfstride, dst_v, halfstride, - width, height); - break; - } - case FOURCC_I422: - case FOURCC_YV16: { - dst_sample_stride = dst_sample_stride ? 
dst_sample_stride : width; - int halfstride = (dst_sample_stride + 1) / 2; - uint8_t* dst_u; - uint8_t* dst_v; - if (format == FOURCC_YV16) { - dst_v = dst_sample + dst_sample_stride * height; - dst_u = dst_v + halfstride * height; - } else { - dst_u = dst_sample + dst_sample_stride * height; - dst_v = dst_u + halfstride * height; - } - r = I420ToI422(y, y_stride, u, u_stride, v, v_stride, dst_sample, - dst_sample_stride, dst_u, halfstride, dst_v, halfstride, - width, height); - break; - } - case FOURCC_I444: - case FOURCC_YV24: { - dst_sample_stride = dst_sample_stride ? dst_sample_stride : width; - uint8_t* dst_u; - uint8_t* dst_v; - if (format == FOURCC_YV24) { - dst_v = dst_sample + dst_sample_stride * height; - dst_u = dst_v + dst_sample_stride * height; - } else { - dst_u = dst_sample + dst_sample_stride * height; - dst_v = dst_u + dst_sample_stride * height; - } - r = I420ToI444(y, y_stride, u, u_stride, v, v_stride, dst_sample, - dst_sample_stride, dst_u, dst_sample_stride, dst_v, - dst_sample_stride, width, height); - break; - } - // Formats not supported - MJPG, biplanar, some rgb formats. - default: - return -1; // unknown fourcc - return failure code. - } - return r; -} - -#ifdef __cplusplus -} // extern "C" -} // namespace libyuv -#endif diff --git a/thirdparty/libyuv/source/convert_from_argb.cc b/thirdparty/libyuv/source/convert_from_argb.cc deleted file mode 100644 index e146158..0000000 --- a/thirdparty/libyuv/source/convert_from_argb.cc +++ /dev/null @@ -1,2281 +0,0 @@ -/* - * Copyright 2012 The LibYuv Project Authors. All rights reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#include "libyuv/convert_from_argb.h" - -#include "libyuv/basic_types.h" -#include "libyuv/cpu_id.h" -#include "libyuv/planar_functions.h" -#include "libyuv/row.h" - -#ifdef __cplusplus -namespace libyuv { -extern "C" { -#endif - -// ARGB little endian (bgra in memory) to I444 -LIBYUV_API -int ARGBToI444(const uint8_t* src_argb, - int src_stride_argb, - uint8_t* dst_y, - int dst_stride_y, - uint8_t* dst_u, - int dst_stride_u, - uint8_t* dst_v, - int dst_stride_v, - int width, - int height) { - int y; - void (*ARGBToYRow)(const uint8_t* src_argb, uint8_t* dst_y, int width) = - ARGBToYRow_C; - void (*ARGBToUV444Row)(const uint8_t* src_argb, uint8_t* dst_u, - uint8_t* dst_v, int width) = ARGBToUV444Row_C; - if (!src_argb || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) { - return -1; - } - if (height < 0) { - height = -height; - src_argb = src_argb + (height - 1) * src_stride_argb; - src_stride_argb = -src_stride_argb; - } - // Coalesce rows. 
- if (src_stride_argb == width * 4 && dst_stride_y == width && - dst_stride_u == width && dst_stride_v == width) { - width *= height; - height = 1; - src_stride_argb = dst_stride_y = dst_stride_u = dst_stride_v = 0; - } -#if defined(HAS_ARGBTOUV444ROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - ARGBToUV444Row = ARGBToUV444Row_Any_SSSE3; - if (IS_ALIGNED(width, 16)) { - ARGBToUV444Row = ARGBToUV444Row_SSSE3; - } - } -#endif -#if defined(HAS_ARGBTOUV444ROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - ARGBToUV444Row = ARGBToUV444Row_Any_NEON; - if (IS_ALIGNED(width, 8)) { - ARGBToUV444Row = ARGBToUV444Row_NEON; - } - } -#endif -#if defined(HAS_ARGBTOUV444ROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - ARGBToUV444Row = ARGBToUV444Row_Any_MMI; - if (IS_ALIGNED(width, 8)) { - ARGBToUV444Row = ARGBToUV444Row_MMI; - } - } -#endif -#if defined(HAS_ARGBTOUV444ROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - ARGBToUV444Row = ARGBToUV444Row_Any_MSA; - if (IS_ALIGNED(width, 16)) { - ARGBToUV444Row = ARGBToUV444Row_MSA; - } - } -#endif -#if defined(HAS_ARGBTOYROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - ARGBToYRow = ARGBToYRow_Any_SSSE3; - if (IS_ALIGNED(width, 16)) { - ARGBToYRow = ARGBToYRow_SSSE3; - } - } -#endif -#if defined(HAS_ARGBTOYROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - ARGBToYRow = ARGBToYRow_Any_AVX2; - if (IS_ALIGNED(width, 32)) { - ARGBToYRow = ARGBToYRow_AVX2; - } - } -#endif -#if defined(HAS_ARGBTOYROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - ARGBToYRow = ARGBToYRow_Any_NEON; - if (IS_ALIGNED(width, 8)) { - ARGBToYRow = ARGBToYRow_NEON; - } - } -#endif -#if defined(HAS_ARGBTOYROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - ARGBToYRow = ARGBToYRow_Any_MMI; - if (IS_ALIGNED(width, 8)) { - ARGBToYRow = ARGBToYRow_MMI; - } - } -#endif -#if defined(HAS_ARGBTOYROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - ARGBToYRow = ARGBToYRow_Any_MSA; - if (IS_ALIGNED(width, 16)) { - ARGBToYRow = ARGBToYRow_MSA; - } - } -#endif - - for (y = 0; y < height; ++y) { - ARGBToUV444Row(src_argb, dst_u, dst_v, width); - ARGBToYRow(src_argb, dst_y, width); - src_argb += src_stride_argb; - dst_y += dst_stride_y; - dst_u += dst_stride_u; - dst_v += dst_stride_v; - } - return 0; -} - -// ARGB little endian (bgra in memory) to I422 -LIBYUV_API -int ARGBToI422(const uint8_t* src_argb, - int src_stride_argb, - uint8_t* dst_y, - int dst_stride_y, - uint8_t* dst_u, - int dst_stride_u, - uint8_t* dst_v, - int dst_stride_v, - int width, - int height) { - int y; - void (*ARGBToUVRow)(const uint8_t* src_argb0, int src_stride_argb, - uint8_t* dst_u, uint8_t* dst_v, int width) = - ARGBToUVRow_C; - void (*ARGBToYRow)(const uint8_t* src_argb, uint8_t* dst_y, int width) = - ARGBToYRow_C; - if (!src_argb || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) { - return -1; - } - // Negative height means invert the image. - if (height < 0) { - height = -height; - src_argb = src_argb + (height - 1) * src_stride_argb; - src_stride_argb = -src_stride_argb; - } - // Coalesce rows. 
- if (src_stride_argb == width * 4 && dst_stride_y == width && - dst_stride_u * 2 == width && dst_stride_v * 2 == width) { - width *= height; - height = 1; - src_stride_argb = dst_stride_y = dst_stride_u = dst_stride_v = 0; - } -#if defined(HAS_ARGBTOYROW_SSSE3) && defined(HAS_ARGBTOUVROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - ARGBToUVRow = ARGBToUVRow_Any_SSSE3; - ARGBToYRow = ARGBToYRow_Any_SSSE3; - if (IS_ALIGNED(width, 16)) { - ARGBToUVRow = ARGBToUVRow_SSSE3; - ARGBToYRow = ARGBToYRow_SSSE3; - } - } -#endif -#if defined(HAS_ARGBTOYROW_AVX2) && defined(HAS_ARGBTOUVROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - ARGBToUVRow = ARGBToUVRow_Any_AVX2; - ARGBToYRow = ARGBToYRow_Any_AVX2; - if (IS_ALIGNED(width, 32)) { - ARGBToUVRow = ARGBToUVRow_AVX2; - ARGBToYRow = ARGBToYRow_AVX2; - } - } -#endif -#if defined(HAS_ARGBTOYROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - ARGBToYRow = ARGBToYRow_Any_NEON; - if (IS_ALIGNED(width, 8)) { - ARGBToYRow = ARGBToYRow_NEON; - } - } -#endif -#if defined(HAS_ARGBTOUVROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - ARGBToUVRow = ARGBToUVRow_Any_NEON; - if (IS_ALIGNED(width, 16)) { - ARGBToUVRow = ARGBToUVRow_NEON; - } - } -#endif - -#if defined(HAS_ARGBTOYROW_MMI) && defined(HAS_ARGBTOUVROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - ARGBToYRow = ARGBToYRow_Any_MMI; - ARGBToUVRow = ARGBToUVRow_Any_MMI; - if (IS_ALIGNED(width, 8)) { - ARGBToYRow = ARGBToYRow_MMI; - } - if (IS_ALIGNED(width, 16)) { - ARGBToUVRow = ARGBToUVRow_MMI; - } - } -#endif - -#if defined(HAS_ARGBTOYROW_MSA) && defined(HAS_ARGBTOUVROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - ARGBToYRow = ARGBToYRow_Any_MSA; - ARGBToUVRow = ARGBToUVRow_Any_MSA; - if (IS_ALIGNED(width, 16)) { - ARGBToYRow = ARGBToYRow_MSA; - } - if (IS_ALIGNED(width, 32)) { - ARGBToUVRow = ARGBToUVRow_MSA; - } - } -#endif - - for (y = 0; y < height; ++y) { - ARGBToUVRow(src_argb, 0, dst_u, dst_v, width); - ARGBToYRow(src_argb, dst_y, width); - src_argb += src_stride_argb; - dst_y += dst_stride_y; - dst_u += dst_stride_u; - dst_v += dst_stride_v; - } - return 0; -} - -LIBYUV_API -int ARGBToNV12(const uint8_t* src_argb, - int src_stride_argb, - uint8_t* dst_y, - int dst_stride_y, - uint8_t* dst_uv, - int dst_stride_uv, - int width, - int height) { - int y; - int halfwidth = (width + 1) >> 1; - void (*ARGBToUVRow)(const uint8_t* src_argb0, int src_stride_argb, - uint8_t* dst_u, uint8_t* dst_v, int width) = - ARGBToUVRow_C; - void (*ARGBToYRow)(const uint8_t* src_argb, uint8_t* dst_y, int width) = - ARGBToYRow_C; - void (*MergeUVRow_)(const uint8_t* src_u, const uint8_t* src_v, - uint8_t* dst_uv, int width) = MergeUVRow_C; - if (!src_argb || !dst_y || !dst_uv || width <= 0 || height == 0) { - return -1; - } - // Negative height means invert the image. 
- if (height < 0) { - height = -height; - src_argb = src_argb + (height - 1) * src_stride_argb; - src_stride_argb = -src_stride_argb; - } -#if defined(HAS_ARGBTOYROW_SSSE3) && defined(HAS_ARGBTOUVROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - ARGBToUVRow = ARGBToUVRow_Any_SSSE3; - ARGBToYRow = ARGBToYRow_Any_SSSE3; - if (IS_ALIGNED(width, 16)) { - ARGBToUVRow = ARGBToUVRow_SSSE3; - ARGBToYRow = ARGBToYRow_SSSE3; - } - } -#endif -#if defined(HAS_ARGBTOYROW_AVX2) && defined(HAS_ARGBTOUVROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - ARGBToUVRow = ARGBToUVRow_Any_AVX2; - ARGBToYRow = ARGBToYRow_Any_AVX2; - if (IS_ALIGNED(width, 32)) { - ARGBToUVRow = ARGBToUVRow_AVX2; - ARGBToYRow = ARGBToYRow_AVX2; - } - } -#endif -#if defined(HAS_ARGBTOYROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - ARGBToYRow = ARGBToYRow_Any_NEON; - if (IS_ALIGNED(width, 8)) { - ARGBToYRow = ARGBToYRow_NEON; - } - } -#endif -#if defined(HAS_ARGBTOUVROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - ARGBToUVRow = ARGBToUVRow_Any_NEON; - if (IS_ALIGNED(width, 16)) { - ARGBToUVRow = ARGBToUVRow_NEON; - } - } -#endif -#if defined(HAS_ARGBTOYROW_MMI) && defined(HAS_ARGBTOUVROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - ARGBToYRow = ARGBToYRow_Any_MMI; - ARGBToUVRow = ARGBToUVRow_Any_MMI; - if (IS_ALIGNED(width, 8)) { - ARGBToYRow = ARGBToYRow_MMI; - } - if (IS_ALIGNED(width, 16)) { - ARGBToUVRow = ARGBToUVRow_MMI; - } - } -#endif -#if defined(HAS_ARGBTOYROW_MSA) && defined(HAS_ARGBTOUVROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - ARGBToYRow = ARGBToYRow_Any_MSA; - ARGBToUVRow = ARGBToUVRow_Any_MSA; - if (IS_ALIGNED(width, 16)) { - ARGBToYRow = ARGBToYRow_MSA; - } - if (IS_ALIGNED(width, 32)) { - ARGBToUVRow = ARGBToUVRow_MSA; - } - } -#endif -#if defined(HAS_MERGEUVROW_SSE2) - if (TestCpuFlag(kCpuHasSSE2)) { - MergeUVRow_ = MergeUVRow_Any_SSE2; - if (IS_ALIGNED(halfwidth, 16)) { - MergeUVRow_ = MergeUVRow_SSE2; - } - } -#endif -#if defined(HAS_MERGEUVROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - MergeUVRow_ = MergeUVRow_Any_AVX2; - if (IS_ALIGNED(halfwidth, 32)) { - MergeUVRow_ = MergeUVRow_AVX2; - } - } -#endif -#if defined(HAS_MERGEUVROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - MergeUVRow_ = MergeUVRow_Any_NEON; - if (IS_ALIGNED(halfwidth, 16)) { - MergeUVRow_ = MergeUVRow_NEON; - } - } -#endif -#if defined(HAS_MERGEUVROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - MergeUVRow_ = MergeUVRow_Any_MMI; - if (IS_ALIGNED(halfwidth, 8)) { - MergeUVRow_ = MergeUVRow_MMI; - } - } -#endif -#if defined(HAS_MERGEUVROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - MergeUVRow_ = MergeUVRow_Any_MSA; - if (IS_ALIGNED(halfwidth, 16)) { - MergeUVRow_ = MergeUVRow_MSA; - } - } -#endif - { - // Allocate a rows of uv. - align_buffer_64(row_u, ((halfwidth + 31) & ~31) * 2); - uint8_t* row_v = row_u + ((halfwidth + 31) & ~31); - - for (y = 0; y < height - 1; y += 2) { - ARGBToUVRow(src_argb, src_stride_argb, row_u, row_v, width); - MergeUVRow_(row_u, row_v, dst_uv, halfwidth); - ARGBToYRow(src_argb, dst_y, width); - ARGBToYRow(src_argb + src_stride_argb, dst_y + dst_stride_y, width); - src_argb += src_stride_argb * 2; - dst_y += dst_stride_y * 2; - dst_uv += dst_stride_uv; - } - if (height & 1) { - ARGBToUVRow(src_argb, 0, row_u, row_v, width); - MergeUVRow_(row_u, row_v, dst_uv, halfwidth); - ARGBToYRow(src_argb, dst_y, width); - } - free_aligned_buffer_64(row_u); - } - return 0; -} - -// Same as NV12 but U and V swapped. 
-LIBYUV_API -int ARGBToNV21(const uint8_t* src_argb, - int src_stride_argb, - uint8_t* dst_y, - int dst_stride_y, - uint8_t* dst_vu, - int dst_stride_vu, - int width, - int height) { - int y; - int halfwidth = (width + 1) >> 1; - void (*ARGBToUVRow)(const uint8_t* src_argb0, int src_stride_argb, - uint8_t* dst_u, uint8_t* dst_v, int width) = - ARGBToUVRow_C; - void (*ARGBToYRow)(const uint8_t* src_argb, uint8_t* dst_y, int width) = - ARGBToYRow_C; - void (*MergeUVRow_)(const uint8_t* src_u, const uint8_t* src_v, - uint8_t* dst_vu, int width) = MergeUVRow_C; - if (!src_argb || !dst_y || !dst_vu || width <= 0 || height == 0) { - return -1; - } - // Negative height means invert the image. - if (height < 0) { - height = -height; - src_argb = src_argb + (height - 1) * src_stride_argb; - src_stride_argb = -src_stride_argb; - } -#if defined(HAS_ARGBTOYROW_SSSE3) && defined(HAS_ARGBTOUVROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - ARGBToUVRow = ARGBToUVRow_Any_SSSE3; - ARGBToYRow = ARGBToYRow_Any_SSSE3; - if (IS_ALIGNED(width, 16)) { - ARGBToUVRow = ARGBToUVRow_SSSE3; - ARGBToYRow = ARGBToYRow_SSSE3; - } - } -#endif -#if defined(HAS_ARGBTOYROW_AVX2) && defined(HAS_ARGBTOUVROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - ARGBToUVRow = ARGBToUVRow_Any_AVX2; - ARGBToYRow = ARGBToYRow_Any_AVX2; - if (IS_ALIGNED(width, 32)) { - ARGBToUVRow = ARGBToUVRow_AVX2; - ARGBToYRow = ARGBToYRow_AVX2; - } - } -#endif -#if defined(HAS_ARGBTOYROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - ARGBToYRow = ARGBToYRow_Any_NEON; - if (IS_ALIGNED(width, 8)) { - ARGBToYRow = ARGBToYRow_NEON; - } - } -#endif -#if defined(HAS_ARGBTOUVROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - ARGBToUVRow = ARGBToUVRow_Any_NEON; - if (IS_ALIGNED(width, 16)) { - ARGBToUVRow = ARGBToUVRow_NEON; - } - } -#endif -#if defined(HAS_ARGBTOYROW_MMI) && defined(HAS_ARGBTOUVROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - ARGBToYRow = ARGBToYRow_Any_MMI; - ARGBToUVRow = ARGBToUVRow_Any_MMI; - if (IS_ALIGNED(width, 8)) { - ARGBToYRow = ARGBToYRow_MMI; - } - if (IS_ALIGNED(width, 16)) { - ARGBToUVRow = ARGBToUVRow_MMI; - } - } -#endif -#if defined(HAS_ARGBTOYROW_MSA) && defined(HAS_ARGBTOUVROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - ARGBToYRow = ARGBToYRow_Any_MSA; - ARGBToUVRow = ARGBToUVRow_Any_MSA; - if (IS_ALIGNED(width, 16)) { - ARGBToYRow = ARGBToYRow_MSA; - } - if (IS_ALIGNED(width, 32)) { - ARGBToUVRow = ARGBToUVRow_MSA; - } - } -#endif -#if defined(HAS_MERGEUVROW_SSE2) - if (TestCpuFlag(kCpuHasSSE2)) { - MergeUVRow_ = MergeUVRow_Any_SSE2; - if (IS_ALIGNED(halfwidth, 16)) { - MergeUVRow_ = MergeUVRow_SSE2; - } - } -#endif -#if defined(HAS_MERGEUVROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - MergeUVRow_ = MergeUVRow_Any_AVX2; - if (IS_ALIGNED(halfwidth, 32)) { - MergeUVRow_ = MergeUVRow_AVX2; - } - } -#endif -#if defined(HAS_MERGEUVROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - MergeUVRow_ = MergeUVRow_Any_NEON; - if (IS_ALIGNED(halfwidth, 16)) { - MergeUVRow_ = MergeUVRow_NEON; - } - } -#endif -#if defined(HAS_MERGEUVROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - MergeUVRow_ = MergeUVRow_Any_MMI; - if (IS_ALIGNED(halfwidth, 8)) { - MergeUVRow_ = MergeUVRow_MMI; - } - } -#endif -#if defined(HAS_MERGEUVROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - MergeUVRow_ = MergeUVRow_Any_MSA; - if (IS_ALIGNED(halfwidth, 16)) { - MergeUVRow_ = MergeUVRow_MSA; - } - } -#endif - { - // Allocate a rows of uv. 
- align_buffer_64(row_u, ((halfwidth + 31) & ~31) * 2); - uint8_t* row_v = row_u + ((halfwidth + 31) & ~31); - - for (y = 0; y < height - 1; y += 2) { - ARGBToUVRow(src_argb, src_stride_argb, row_u, row_v, width); - MergeUVRow_(row_v, row_u, dst_vu, halfwidth); - ARGBToYRow(src_argb, dst_y, width); - ARGBToYRow(src_argb + src_stride_argb, dst_y + dst_stride_y, width); - src_argb += src_stride_argb * 2; - dst_y += dst_stride_y * 2; - dst_vu += dst_stride_vu; - } - if (height & 1) { - ARGBToUVRow(src_argb, 0, row_u, row_v, width); - MergeUVRow_(row_v, row_u, dst_vu, halfwidth); - ARGBToYRow(src_argb, dst_y, width); - } - free_aligned_buffer_64(row_u); - } - return 0; -} - -LIBYUV_API -int ABGRToNV12(const uint8_t* src_abgr, - int src_stride_abgr, - uint8_t* dst_y, - int dst_stride_y, - uint8_t* dst_uv, - int dst_stride_uv, - int width, - int height) { - int y; - int halfwidth = (width + 1) >> 1; - void (*ABGRToUVRow)(const uint8_t* src_abgr0, int src_stride_abgr, - uint8_t* dst_u, uint8_t* dst_v, int width) = - ABGRToUVRow_C; - void (*ABGRToYRow)(const uint8_t* src_abgr, uint8_t* dst_y, int width) = - ABGRToYRow_C; - void (*MergeUVRow_)(const uint8_t* src_u, const uint8_t* src_v, - uint8_t* dst_uv, int width) = MergeUVRow_C; - if (!src_abgr || !dst_y || !dst_uv || width <= 0 || height == 0) { - return -1; - } - // Negative height means invert the image. - if (height < 0) { - height = -height; - src_abgr = src_abgr + (height - 1) * src_stride_abgr; - src_stride_abgr = -src_stride_abgr; - } -#if defined(HAS_ABGRTOYROW_SSSE3) && defined(HAS_ABGRTOUVROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - ABGRToUVRow = ABGRToUVRow_Any_SSSE3; - ABGRToYRow = ABGRToYRow_Any_SSSE3; - if (IS_ALIGNED(width, 16)) { - ABGRToUVRow = ABGRToUVRow_SSSE3; - ABGRToYRow = ABGRToYRow_SSSE3; - } - } -#endif -#if defined(HAS_ABGRTOYROW_AVX2) && defined(HAS_ABGRTOUVROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - ABGRToUVRow = ABGRToUVRow_Any_AVX2; - ABGRToYRow = ABGRToYRow_Any_AVX2; - if (IS_ALIGNED(width, 32)) { - ABGRToUVRow = ABGRToUVRow_AVX2; - ABGRToYRow = ABGRToYRow_AVX2; - } - } -#endif -#if defined(HAS_ABGRTOYROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - ABGRToYRow = ABGRToYRow_Any_NEON; - if (IS_ALIGNED(width, 8)) { - ABGRToYRow = ABGRToYRow_NEON; - } - } -#endif -#if defined(HAS_ABGRTOUVROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - ABGRToUVRow = ABGRToUVRow_Any_NEON; - if (IS_ALIGNED(width, 16)) { - ABGRToUVRow = ABGRToUVRow_NEON; - } - } -#endif -#if defined(HAS_ABGRTOYROW_MMI) && defined(HAS_ABGRTOUVROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - ABGRToYRow = ABGRToYRow_Any_MMI; - ABGRToUVRow = ABGRToUVRow_Any_MMI; - if (IS_ALIGNED(width, 8)) { - ABGRToYRow = ABGRToYRow_MMI; - } - if (IS_ALIGNED(width, 16)) { - ABGRToUVRow = ABGRToUVRow_MMI; - } - } -#endif -#if defined(HAS_ABGRTOYROW_MSA) && defined(HAS_ABGRTOUVROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - ABGRToYRow = ABGRToYRow_Any_MSA; - ABGRToUVRow = ABGRToUVRow_Any_MSA; - if (IS_ALIGNED(width, 16)) { - ABGRToYRow = ABGRToYRow_MSA; - } - if (IS_ALIGNED(width, 32)) { - ABGRToUVRow = ABGRToUVRow_MSA; - } - } -#endif -#if defined(HAS_MERGEUVROW_SSE2) - if (TestCpuFlag(kCpuHasSSE2)) { - MergeUVRow_ = MergeUVRow_Any_SSE2; - if (IS_ALIGNED(halfwidth, 16)) { - MergeUVRow_ = MergeUVRow_SSE2; - } - } -#endif -#if defined(HAS_MERGEUVROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - MergeUVRow_ = MergeUVRow_Any_AVX2; - if (IS_ALIGNED(halfwidth, 32)) { - MergeUVRow_ = MergeUVRow_AVX2; - } - } -#endif -#if defined(HAS_MERGEUVROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - 
MergeUVRow_ = MergeUVRow_Any_NEON; - if (IS_ALIGNED(halfwidth, 16)) { - MergeUVRow_ = MergeUVRow_NEON; - } - } -#endif -#if defined(HAS_MERGEUVROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - MergeUVRow_ = MergeUVRow_Any_MMI; - if (IS_ALIGNED(halfwidth, 8)) { - MergeUVRow_ = MergeUVRow_MMI; - } - } -#endif -#if defined(HAS_MERGEUVROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - MergeUVRow_ = MergeUVRow_Any_MSA; - if (IS_ALIGNED(halfwidth, 16)) { - MergeUVRow_ = MergeUVRow_MSA; - } - } -#endif - { - // Allocate a rows of uv. - align_buffer_64(row_u, ((halfwidth + 31) & ~31) * 2); - uint8_t* row_v = row_u + ((halfwidth + 31) & ~31); - - for (y = 0; y < height - 1; y += 2) { - ABGRToUVRow(src_abgr, src_stride_abgr, row_u, row_v, width); - MergeUVRow_(row_u, row_v, dst_uv, halfwidth); - ABGRToYRow(src_abgr, dst_y, width); - ABGRToYRow(src_abgr + src_stride_abgr, dst_y + dst_stride_y, width); - src_abgr += src_stride_abgr * 2; - dst_y += dst_stride_y * 2; - dst_uv += dst_stride_uv; - } - if (height & 1) { - ABGRToUVRow(src_abgr, 0, row_u, row_v, width); - MergeUVRow_(row_u, row_v, dst_uv, halfwidth); - ABGRToYRow(src_abgr, dst_y, width); - } - free_aligned_buffer_64(row_u); - } - return 0; -} - -// Same as NV12 but U and V swapped. -LIBYUV_API -int ABGRToNV21(const uint8_t* src_abgr, - int src_stride_abgr, - uint8_t* dst_y, - int dst_stride_y, - uint8_t* dst_vu, - int dst_stride_vu, - int width, - int height) { - int y; - int halfwidth = (width + 1) >> 1; - void (*ABGRToUVRow)(const uint8_t* src_abgr0, int src_stride_abgr, - uint8_t* dst_u, uint8_t* dst_v, int width) = - ABGRToUVRow_C; - void (*ABGRToYRow)(const uint8_t* src_abgr, uint8_t* dst_y, int width) = - ABGRToYRow_C; - void (*MergeUVRow_)(const uint8_t* src_u, const uint8_t* src_v, - uint8_t* dst_vu, int width) = MergeUVRow_C; - if (!src_abgr || !dst_y || !dst_vu || width <= 0 || height == 0) { - return -1; - } - // Negative height means invert the image. 
- if (height < 0) { - height = -height; - src_abgr = src_abgr + (height - 1) * src_stride_abgr; - src_stride_abgr = -src_stride_abgr; - } -#if defined(HAS_ABGRTOYROW_SSSE3) && defined(HAS_ABGRTOUVROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - ABGRToUVRow = ABGRToUVRow_Any_SSSE3; - ABGRToYRow = ABGRToYRow_Any_SSSE3; - if (IS_ALIGNED(width, 16)) { - ABGRToUVRow = ABGRToUVRow_SSSE3; - ABGRToYRow = ABGRToYRow_SSSE3; - } - } -#endif -#if defined(HAS_ABGRTOYROW_AVX2) && defined(HAS_ABGRTOUVROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - ABGRToUVRow = ABGRToUVRow_Any_AVX2; - ABGRToYRow = ABGRToYRow_Any_AVX2; - if (IS_ALIGNED(width, 32)) { - ABGRToUVRow = ABGRToUVRow_AVX2; - ABGRToYRow = ABGRToYRow_AVX2; - } - } -#endif -#if defined(HAS_ABGRTOYROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - ABGRToYRow = ABGRToYRow_Any_NEON; - if (IS_ALIGNED(width, 8)) { - ABGRToYRow = ABGRToYRow_NEON; - } - } -#endif -#if defined(HAS_ABGRTOUVROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - ABGRToUVRow = ABGRToUVRow_Any_NEON; - if (IS_ALIGNED(width, 16)) { - ABGRToUVRow = ABGRToUVRow_NEON; - } - } -#endif -#if defined(HAS_ABGRTOYROW_MMI) && defined(HAS_ABGRTOUVROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - ABGRToYRow = ABGRToYRow_Any_MMI; - ABGRToUVRow = ABGRToUVRow_Any_MMI; - if (IS_ALIGNED(width, 8)) { - ABGRToYRow = ABGRToYRow_MMI; - } - if (IS_ALIGNED(width, 16)) { - ABGRToUVRow = ABGRToUVRow_MMI; - } - } -#endif -#if defined(HAS_ABGRTOYROW_MSA) && defined(HAS_ABGRTOUVROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - ABGRToYRow = ABGRToYRow_Any_MSA; - ABGRToUVRow = ABGRToUVRow_Any_MSA; - if (IS_ALIGNED(width, 16)) { - ABGRToYRow = ABGRToYRow_MSA; - } - if (IS_ALIGNED(width, 32)) { - ABGRToUVRow = ABGRToUVRow_MSA; - } - } -#endif -#if defined(HAS_MERGEUVROW_SSE2) - if (TestCpuFlag(kCpuHasSSE2)) { - MergeUVRow_ = MergeUVRow_Any_SSE2; - if (IS_ALIGNED(halfwidth, 16)) { - MergeUVRow_ = MergeUVRow_SSE2; - } - } -#endif -#if defined(HAS_MERGEUVROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - MergeUVRow_ = MergeUVRow_Any_AVX2; - if (IS_ALIGNED(halfwidth, 32)) { - MergeUVRow_ = MergeUVRow_AVX2; - } - } -#endif -#if defined(HAS_MERGEUVROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - MergeUVRow_ = MergeUVRow_Any_NEON; - if (IS_ALIGNED(halfwidth, 16)) { - MergeUVRow_ = MergeUVRow_NEON; - } - } -#endif -#if defined(HAS_MERGEUVROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - MergeUVRow_ = MergeUVRow_Any_MMI; - if (IS_ALIGNED(halfwidth, 8)) { - MergeUVRow_ = MergeUVRow_MMI; - } - } -#endif -#if defined(HAS_MERGEUVROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - MergeUVRow_ = MergeUVRow_Any_MSA; - if (IS_ALIGNED(halfwidth, 16)) { - MergeUVRow_ = MergeUVRow_MSA; - } - } -#endif - { - // Allocate a rows of uv. - align_buffer_64(row_u, ((halfwidth + 31) & ~31) * 2); - uint8_t* row_v = row_u + ((halfwidth + 31) & ~31); - - for (y = 0; y < height - 1; y += 2) { - ABGRToUVRow(src_abgr, src_stride_abgr, row_u, row_v, width); - MergeUVRow_(row_v, row_u, dst_vu, halfwidth); - ABGRToYRow(src_abgr, dst_y, width); - ABGRToYRow(src_abgr + src_stride_abgr, dst_y + dst_stride_y, width); - src_abgr += src_stride_abgr * 2; - dst_y += dst_stride_y * 2; - dst_vu += dst_stride_vu; - } - if (height & 1) { - ABGRToUVRow(src_abgr, 0, row_u, row_v, width); - MergeUVRow_(row_v, row_u, dst_vu, halfwidth); - ABGRToYRow(src_abgr, dst_y, width); - } - free_aligned_buffer_64(row_u); - } - return 0; -} - -// Convert ARGB to YUY2. 
-LIBYUV_API -int ARGBToYUY2(const uint8_t* src_argb, - int src_stride_argb, - uint8_t* dst_yuy2, - int dst_stride_yuy2, - int width, - int height) { - int y; - void (*ARGBToUVRow)(const uint8_t* src_argb, int src_stride_argb, - uint8_t* dst_u, uint8_t* dst_v, int width) = - ARGBToUVRow_C; - void (*ARGBToYRow)(const uint8_t* src_argb, uint8_t* dst_y, int width) = - ARGBToYRow_C; - void (*I422ToYUY2Row)(const uint8_t* src_y, const uint8_t* src_u, - const uint8_t* src_v, uint8_t* dst_yuy2, int width) = - I422ToYUY2Row_C; - - if (!src_argb || !dst_yuy2 || width <= 0 || height == 0) { - return -1; - } - // Negative height means invert the image. - if (height < 0) { - height = -height; - dst_yuy2 = dst_yuy2 + (height - 1) * dst_stride_yuy2; - dst_stride_yuy2 = -dst_stride_yuy2; - } - // Coalesce rows. - if (src_stride_argb == width * 4 && dst_stride_yuy2 == width * 2) { - width *= height; - height = 1; - src_stride_argb = dst_stride_yuy2 = 0; - } -#if defined(HAS_ARGBTOYROW_SSSE3) && defined(HAS_ARGBTOUVROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - ARGBToUVRow = ARGBToUVRow_Any_SSSE3; - ARGBToYRow = ARGBToYRow_Any_SSSE3; - if (IS_ALIGNED(width, 16)) { - ARGBToUVRow = ARGBToUVRow_SSSE3; - ARGBToYRow = ARGBToYRow_SSSE3; - } - } -#endif -#if defined(HAS_ARGBTOYROW_AVX2) && defined(HAS_ARGBTOUVROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - ARGBToUVRow = ARGBToUVRow_Any_AVX2; - ARGBToYRow = ARGBToYRow_Any_AVX2; - if (IS_ALIGNED(width, 32)) { - ARGBToUVRow = ARGBToUVRow_AVX2; - ARGBToYRow = ARGBToYRow_AVX2; - } - } -#endif -#if defined(HAS_ARGBTOYROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - ARGBToYRow = ARGBToYRow_Any_NEON; - if (IS_ALIGNED(width, 8)) { - ARGBToYRow = ARGBToYRow_NEON; - } - } -#endif -#if defined(HAS_ARGBTOUVROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - ARGBToUVRow = ARGBToUVRow_Any_NEON; - if (IS_ALIGNED(width, 16)) { - ARGBToUVRow = ARGBToUVRow_NEON; - } - } -#endif -#if defined(HAS_ARGBTOYROW_MMI) && defined(HAS_ARGBTOUVROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - ARGBToYRow = ARGBToYRow_Any_MMI; - ARGBToUVRow = ARGBToUVRow_Any_MMI; - if (IS_ALIGNED(width, 8)) { - ARGBToYRow = ARGBToYRow_MMI; - } - if (IS_ALIGNED(width, 16)) { - ARGBToUVRow = ARGBToUVRow_MMI; - } - } -#endif -#if defined(HAS_ARGBTOYROW_MSA) && defined(HAS_ARGBTOUVROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - ARGBToYRow = ARGBToYRow_Any_MSA; - ARGBToUVRow = ARGBToUVRow_Any_MSA; - if (IS_ALIGNED(width, 16)) { - ARGBToYRow = ARGBToYRow_MSA; - } - if (IS_ALIGNED(width, 32)) { - ARGBToUVRow = ARGBToUVRow_MSA; - } - } -#endif -#if defined(HAS_I422TOYUY2ROW_SSE2) - if (TestCpuFlag(kCpuHasSSE2)) { - I422ToYUY2Row = I422ToYUY2Row_Any_SSE2; - if (IS_ALIGNED(width, 16)) { - I422ToYUY2Row = I422ToYUY2Row_SSE2; - } - } -#endif -#if defined(HAS_I422TOYUY2ROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - I422ToYUY2Row = I422ToYUY2Row_Any_AVX2; - if (IS_ALIGNED(width, 32)) { - I422ToYUY2Row = I422ToYUY2Row_AVX2; - } - } -#endif -#if defined(HAS_I422TOYUY2ROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - I422ToYUY2Row = I422ToYUY2Row_Any_NEON; - if (IS_ALIGNED(width, 16)) { - I422ToYUY2Row = I422ToYUY2Row_NEON; - } - } -#endif -#if defined(HAS_I422TOYUY2ROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - I422ToYUY2Row = I422ToYUY2Row_Any_MMI; - if (IS_ALIGNED(width, 8)) { - I422ToYUY2Row = I422ToYUY2Row_MMI; - } - } -#endif -#if defined(HAS_I422TOYUY2ROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - I422ToYUY2Row = I422ToYUY2Row_Any_MSA; - if (IS_ALIGNED(width, 32)) { - I422ToYUY2Row = I422ToYUY2Row_MSA; - } - } -#endif - - { - // Allocate a rows of yuv. 
- align_buffer_64(row_y, ((width + 63) & ~63) * 2); - uint8_t* row_u = row_y + ((width + 63) & ~63); - uint8_t* row_v = row_u + ((width + 63) & ~63) / 2; - - for (y = 0; y < height; ++y) { - ARGBToUVRow(src_argb, 0, row_u, row_v, width); - ARGBToYRow(src_argb, row_y, width); - I422ToYUY2Row(row_y, row_u, row_v, dst_yuy2, width); - src_argb += src_stride_argb; - dst_yuy2 += dst_stride_yuy2; - } - - free_aligned_buffer_64(row_y); - } - return 0; -} - -// Convert ARGB to UYVY. -LIBYUV_API -int ARGBToUYVY(const uint8_t* src_argb, - int src_stride_argb, - uint8_t* dst_uyvy, - int dst_stride_uyvy, - int width, - int height) { - int y; - void (*ARGBToUVRow)(const uint8_t* src_argb, int src_stride_argb, - uint8_t* dst_u, uint8_t* dst_v, int width) = - ARGBToUVRow_C; - void (*ARGBToYRow)(const uint8_t* src_argb, uint8_t* dst_y, int width) = - ARGBToYRow_C; - void (*I422ToUYVYRow)(const uint8_t* src_y, const uint8_t* src_u, - const uint8_t* src_v, uint8_t* dst_uyvy, int width) = - I422ToUYVYRow_C; - - if (!src_argb || !dst_uyvy || width <= 0 || height == 0) { - return -1; - } - // Negative height means invert the image. - if (height < 0) { - height = -height; - dst_uyvy = dst_uyvy + (height - 1) * dst_stride_uyvy; - dst_stride_uyvy = -dst_stride_uyvy; - } - // Coalesce rows. - if (src_stride_argb == width * 4 && dst_stride_uyvy == width * 2) { - width *= height; - height = 1; - src_stride_argb = dst_stride_uyvy = 0; - } -#if defined(HAS_ARGBTOYROW_SSSE3) && defined(HAS_ARGBTOUVROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - ARGBToUVRow = ARGBToUVRow_Any_SSSE3; - ARGBToYRow = ARGBToYRow_Any_SSSE3; - if (IS_ALIGNED(width, 16)) { - ARGBToUVRow = ARGBToUVRow_SSSE3; - ARGBToYRow = ARGBToYRow_SSSE3; - } - } -#endif -#if defined(HAS_ARGBTOYROW_AVX2) && defined(HAS_ARGBTOUVROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - ARGBToUVRow = ARGBToUVRow_Any_AVX2; - ARGBToYRow = ARGBToYRow_Any_AVX2; - if (IS_ALIGNED(width, 32)) { - ARGBToUVRow = ARGBToUVRow_AVX2; - ARGBToYRow = ARGBToYRow_AVX2; - } - } -#endif -#if defined(HAS_ARGBTOYROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - ARGBToYRow = ARGBToYRow_Any_NEON; - if (IS_ALIGNED(width, 8)) { - ARGBToYRow = ARGBToYRow_NEON; - } - } -#endif -#if defined(HAS_ARGBTOUVROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - ARGBToUVRow = ARGBToUVRow_Any_NEON; - if (IS_ALIGNED(width, 16)) { - ARGBToUVRow = ARGBToUVRow_NEON; - } - } -#endif -#if defined(HAS_ARGBTOYROW_MMI) && defined(HAS_ARGBTOUVROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - ARGBToYRow = ARGBToYRow_Any_MMI; - ARGBToUVRow = ARGBToUVRow_Any_MMI; - if (IS_ALIGNED(width, 8)) { - ARGBToYRow = ARGBToYRow_MMI; - } - if (IS_ALIGNED(width, 16)) { - ARGBToUVRow = ARGBToUVRow_MMI; - } - } -#endif -#if defined(HAS_ARGBTOYROW_MSA) && defined(HAS_ARGBTOUVROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - ARGBToYRow = ARGBToYRow_Any_MSA; - ARGBToUVRow = ARGBToUVRow_Any_MSA; - if (IS_ALIGNED(width, 16)) { - ARGBToYRow = ARGBToYRow_MSA; - } - if (IS_ALIGNED(width, 32)) { - ARGBToUVRow = ARGBToUVRow_MSA; - } - } -#endif -#if defined(HAS_I422TOUYVYROW_SSE2) - if (TestCpuFlag(kCpuHasSSE2)) { - I422ToUYVYRow = I422ToUYVYRow_Any_SSE2; - if (IS_ALIGNED(width, 16)) { - I422ToUYVYRow = I422ToUYVYRow_SSE2; - } - } -#endif -#if defined(HAS_I422TOUYVYROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - I422ToUYVYRow = I422ToUYVYRow_Any_AVX2; - if (IS_ALIGNED(width, 32)) { - I422ToUYVYRow = I422ToUYVYRow_AVX2; - } - } -#endif -#if defined(HAS_I422TOUYVYROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - I422ToUYVYRow = I422ToUYVYRow_Any_NEON; - if (IS_ALIGNED(width, 
16)) { - I422ToUYVYRow = I422ToUYVYRow_NEON; - } - } -#endif -#if defined(HAS_I422TOUYVYROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - I422ToUYVYRow = I422ToUYVYRow_Any_MMI; - if (IS_ALIGNED(width, 8)) { - I422ToUYVYRow = I422ToUYVYRow_MMI; - } - } -#endif -#if defined(HAS_I422TOUYVYROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - I422ToUYVYRow = I422ToUYVYRow_Any_MSA; - if (IS_ALIGNED(width, 32)) { - I422ToUYVYRow = I422ToUYVYRow_MSA; - } - } -#endif - - { - // Allocate a rows of yuv. - align_buffer_64(row_y, ((width + 63) & ~63) * 2); - uint8_t* row_u = row_y + ((width + 63) & ~63); - uint8_t* row_v = row_u + ((width + 63) & ~63) / 2; - - for (y = 0; y < height; ++y) { - ARGBToUVRow(src_argb, 0, row_u, row_v, width); - ARGBToYRow(src_argb, row_y, width); - I422ToUYVYRow(row_y, row_u, row_v, dst_uyvy, width); - src_argb += src_stride_argb; - dst_uyvy += dst_stride_uyvy; - } - - free_aligned_buffer_64(row_y); - } - return 0; -} - -// Convert ARGB to I400. -LIBYUV_API -int ARGBToI400(const uint8_t* src_argb, - int src_stride_argb, - uint8_t* dst_y, - int dst_stride_y, - int width, - int height) { - int y; - void (*ARGBToYRow)(const uint8_t* src_argb, uint8_t* dst_y, int width) = - ARGBToYRow_C; - if (!src_argb || !dst_y || width <= 0 || height == 0) { - return -1; - } - if (height < 0) { - height = -height; - src_argb = src_argb + (height - 1) * src_stride_argb; - src_stride_argb = -src_stride_argb; - } - // Coalesce rows. - if (src_stride_argb == width * 4 && dst_stride_y == width) { - width *= height; - height = 1; - src_stride_argb = dst_stride_y = 0; - } -#if defined(HAS_ARGBTOYROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - ARGBToYRow = ARGBToYRow_Any_SSSE3; - if (IS_ALIGNED(width, 16)) { - ARGBToYRow = ARGBToYRow_SSSE3; - } - } -#endif -#if defined(HAS_ARGBTOYROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - ARGBToYRow = ARGBToYRow_Any_AVX2; - if (IS_ALIGNED(width, 32)) { - ARGBToYRow = ARGBToYRow_AVX2; - } - } -#endif -#if defined(HAS_ARGBTOYROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - ARGBToYRow = ARGBToYRow_Any_NEON; - if (IS_ALIGNED(width, 8)) { - ARGBToYRow = ARGBToYRow_NEON; - } - } -#endif -#if defined(HAS_ARGBTOYROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - ARGBToYRow = ARGBToYRow_Any_MMI; - if (IS_ALIGNED(width, 8)) { - ARGBToYRow = ARGBToYRow_MMI; - } - } -#endif -#if defined(HAS_ARGBTOYROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - ARGBToYRow = ARGBToYRow_Any_MSA; - if (IS_ALIGNED(width, 16)) { - ARGBToYRow = ARGBToYRow_MSA; - } - } -#endif - - for (y = 0; y < height; ++y) { - ARGBToYRow(src_argb, dst_y, width); - src_argb += src_stride_argb; - dst_y += dst_stride_y; - } - return 0; -} - -// Shuffle table for converting ARGB to RGBA. -static const uvec8 kShuffleMaskARGBToRGBA = { - 3u, 0u, 1u, 2u, 7u, 4u, 5u, 6u, 11u, 8u, 9u, 10u, 15u, 12u, 13u, 14u}; - -// Convert ARGB to RGBA. -LIBYUV_API -int ARGBToRGBA(const uint8_t* src_argb, - int src_stride_argb, - uint8_t* dst_rgba, - int dst_stride_rgba, - int width, - int height) { - return ARGBShuffle(src_argb, src_stride_argb, dst_rgba, dst_stride_rgba, - (const uint8_t*)(&kShuffleMaskARGBToRGBA), width, height); -} - -// Convert ARGB To RGB24. 
-LIBYUV_API -int ARGBToRGB24(const uint8_t* src_argb, - int src_stride_argb, - uint8_t* dst_rgb24, - int dst_stride_rgb24, - int width, - int height) { - int y; - void (*ARGBToRGB24Row)(const uint8_t* src_argb, uint8_t* dst_rgb, int width) = - ARGBToRGB24Row_C; - if (!src_argb || !dst_rgb24 || width <= 0 || height == 0) { - return -1; - } - if (height < 0) { - height = -height; - src_argb = src_argb + (height - 1) * src_stride_argb; - src_stride_argb = -src_stride_argb; - } - // Coalesce rows. - if (src_stride_argb == width * 4 && dst_stride_rgb24 == width * 3) { - width *= height; - height = 1; - src_stride_argb = dst_stride_rgb24 = 0; - } -#if defined(HAS_ARGBTORGB24ROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - ARGBToRGB24Row = ARGBToRGB24Row_Any_SSSE3; - if (IS_ALIGNED(width, 16)) { - ARGBToRGB24Row = ARGBToRGB24Row_SSSE3; - } - } -#endif -#if defined(HAS_ARGBTORGB24ROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - ARGBToRGB24Row = ARGBToRGB24Row_Any_AVX2; - if (IS_ALIGNED(width, 32)) { - ARGBToRGB24Row = ARGBToRGB24Row_AVX2; - } - } -#endif -#if defined(HAS_ARGBTORGB24ROW_AVX512VBMI) - if (TestCpuFlag(kCpuHasAVX512VBMI)) { - ARGBToRGB24Row = ARGBToRGB24Row_Any_AVX512VBMI; - if (IS_ALIGNED(width, 32)) { - ARGBToRGB24Row = ARGBToRGB24Row_AVX512VBMI; - } - } -#endif -#if defined(HAS_ARGBTORGB24ROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - ARGBToRGB24Row = ARGBToRGB24Row_Any_NEON; - if (IS_ALIGNED(width, 8)) { - ARGBToRGB24Row = ARGBToRGB24Row_NEON; - } - } -#endif -#if defined(HAS_ARGBTORGB24ROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - ARGBToRGB24Row = ARGBToRGB24Row_Any_MMI; - if (IS_ALIGNED(width, 4)) { - ARGBToRGB24Row = ARGBToRGB24Row_MMI; - } - } -#endif -#if defined(HAS_ARGBTORGB24ROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - ARGBToRGB24Row = ARGBToRGB24Row_Any_MSA; - if (IS_ALIGNED(width, 16)) { - ARGBToRGB24Row = ARGBToRGB24Row_MSA; - } - } -#endif - - for (y = 0; y < height; ++y) { - ARGBToRGB24Row(src_argb, dst_rgb24, width); - src_argb += src_stride_argb; - dst_rgb24 += dst_stride_rgb24; - } - return 0; -} - -// Convert ARGB To RAW. -LIBYUV_API -int ARGBToRAW(const uint8_t* src_argb, - int src_stride_argb, - uint8_t* dst_raw, - int dst_stride_raw, - int width, - int height) { - int y; - void (*ARGBToRAWRow)(const uint8_t* src_argb, uint8_t* dst_rgb, int width) = - ARGBToRAWRow_C; - if (!src_argb || !dst_raw || width <= 0 || height == 0) { - return -1; - } - if (height < 0) { - height = -height; - src_argb = src_argb + (height - 1) * src_stride_argb; - src_stride_argb = -src_stride_argb; - } - // Coalesce rows. 
- if (src_stride_argb == width * 4 && dst_stride_raw == width * 3) { - width *= height; - height = 1; - src_stride_argb = dst_stride_raw = 0; - } -#if defined(HAS_ARGBTORAWROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - ARGBToRAWRow = ARGBToRAWRow_Any_SSSE3; - if (IS_ALIGNED(width, 16)) { - ARGBToRAWRow = ARGBToRAWRow_SSSE3; - } - } -#endif -#if defined(HAS_ARGBTORAWROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - ARGBToRAWRow = ARGBToRAWRow_Any_AVX2; - if (IS_ALIGNED(width, 32)) { - ARGBToRAWRow = ARGBToRAWRow_AVX2; - } - } -#endif -#if defined(HAS_ARGBTORAWROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - ARGBToRAWRow = ARGBToRAWRow_Any_NEON; - if (IS_ALIGNED(width, 8)) { - ARGBToRAWRow = ARGBToRAWRow_NEON; - } - } -#endif -#if defined(HAS_ARGBTORAWROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - ARGBToRAWRow = ARGBToRAWRow_Any_MMI; - if (IS_ALIGNED(width, 4)) { - ARGBToRAWRow = ARGBToRAWRow_MMI; - } - } -#endif -#if defined(HAS_ARGBTORAWROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - ARGBToRAWRow = ARGBToRAWRow_Any_MSA; - if (IS_ALIGNED(width, 16)) { - ARGBToRAWRow = ARGBToRAWRow_MSA; - } - } -#endif - - for (y = 0; y < height; ++y) { - ARGBToRAWRow(src_argb, dst_raw, width); - src_argb += src_stride_argb; - dst_raw += dst_stride_raw; - } - return 0; -} - -// Ordered 8x8 dither for 888 to 565. Values from 0 to 7. -static const uint8_t kDither565_4x4[16] = { - 0, 4, 1, 5, 6, 2, 7, 3, 1, 5, 0, 4, 7, 3, 6, 2, -}; - -// Convert ARGB To RGB565 with 4x4 dither matrix (16 bytes). -LIBYUV_API -int ARGBToRGB565Dither(const uint8_t* src_argb, - int src_stride_argb, - uint8_t* dst_rgb565, - int dst_stride_rgb565, - const uint8_t* dither4x4, - int width, - int height) { - int y; - void (*ARGBToRGB565DitherRow)(const uint8_t* src_argb, uint8_t* dst_rgb, - const uint32_t dither4, int width) = - ARGBToRGB565DitherRow_C; - if (!src_argb || !dst_rgb565 || width <= 0 || height == 0) { - return -1; - } - if (height < 0) { - height = -height; - src_argb = src_argb + (height - 1) * src_stride_argb; - src_stride_argb = -src_stride_argb; - } - if (!dither4x4) { - dither4x4 = kDither565_4x4; - } -#if defined(HAS_ARGBTORGB565DITHERROW_SSE2) - if (TestCpuFlag(kCpuHasSSE2)) { - ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_Any_SSE2; - if (IS_ALIGNED(width, 4)) { - ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_SSE2; - } - } -#endif -#if defined(HAS_ARGBTORGB565DITHERROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_Any_AVX2; - if (IS_ALIGNED(width, 8)) { - ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_AVX2; - } - } -#endif -#if defined(HAS_ARGBTORGB565DITHERROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_Any_NEON; - if (IS_ALIGNED(width, 8)) { - ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_NEON; - } - } -#endif -#if defined(HAS_ARGBTORGB565DITHERROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_Any_MMI; - if (IS_ALIGNED(width, 4)) { - ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_MMI; - } - } -#endif -#if defined(HAS_ARGBTORGB565DITHERROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_Any_MSA; - if (IS_ALIGNED(width, 8)) { - ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_MSA; - } - } -#endif - - for (y = 0; y < height; ++y) { - ARGBToRGB565DitherRow(src_argb, dst_rgb565, - *(const uint32_t*)(dither4x4 + ((y & 3) << 2)), - width); - src_argb += src_stride_argb; - dst_rgb565 += dst_stride_rgb565; - } - return 0; -} - -// Convert ARGB To RGB565. 
-// TODO(fbarchard): Consider using dither function low level with zeros. -LIBYUV_API -int ARGBToRGB565(const uint8_t* src_argb, - int src_stride_argb, - uint8_t* dst_rgb565, - int dst_stride_rgb565, - int width, - int height) { - int y; - void (*ARGBToRGB565Row)(const uint8_t* src_argb, uint8_t* dst_rgb, - int width) = ARGBToRGB565Row_C; - if (!src_argb || !dst_rgb565 || width <= 0 || height == 0) { - return -1; - } - if (height < 0) { - height = -height; - src_argb = src_argb + (height - 1) * src_stride_argb; - src_stride_argb = -src_stride_argb; - } - // Coalesce rows. - if (src_stride_argb == width * 4 && dst_stride_rgb565 == width * 2) { - width *= height; - height = 1; - src_stride_argb = dst_stride_rgb565 = 0; - } -#if defined(HAS_ARGBTORGB565ROW_SSE2) - if (TestCpuFlag(kCpuHasSSE2)) { - ARGBToRGB565Row = ARGBToRGB565Row_Any_SSE2; - if (IS_ALIGNED(width, 4)) { - ARGBToRGB565Row = ARGBToRGB565Row_SSE2; - } - } -#endif -#if defined(HAS_ARGBTORGB565ROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - ARGBToRGB565Row = ARGBToRGB565Row_Any_AVX2; - if (IS_ALIGNED(width, 8)) { - ARGBToRGB565Row = ARGBToRGB565Row_AVX2; - } - } -#endif -#if defined(HAS_ARGBTORGB565ROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - ARGBToRGB565Row = ARGBToRGB565Row_Any_NEON; - if (IS_ALIGNED(width, 8)) { - ARGBToRGB565Row = ARGBToRGB565Row_NEON; - } - } -#endif -#if defined(HAS_ARGBTORGB565ROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - ARGBToRGB565Row = ARGBToRGB565Row_Any_MMI; - if (IS_ALIGNED(width, 4)) { - ARGBToRGB565Row = ARGBToRGB565Row_MMI; - } - } -#endif -#if defined(HAS_ARGBTORGB565ROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - ARGBToRGB565Row = ARGBToRGB565Row_Any_MSA; - if (IS_ALIGNED(width, 8)) { - ARGBToRGB565Row = ARGBToRGB565Row_MSA; - } - } -#endif - - for (y = 0; y < height; ++y) { - ARGBToRGB565Row(src_argb, dst_rgb565, width); - src_argb += src_stride_argb; - dst_rgb565 += dst_stride_rgb565; - } - return 0; -} - -// Convert ARGB To ARGB1555. -LIBYUV_API -int ARGBToARGB1555(const uint8_t* src_argb, - int src_stride_argb, - uint8_t* dst_argb1555, - int dst_stride_argb1555, - int width, - int height) { - int y; - void (*ARGBToARGB1555Row)(const uint8_t* src_argb, uint8_t* dst_rgb, - int width) = ARGBToARGB1555Row_C; - if (!src_argb || !dst_argb1555 || width <= 0 || height == 0) { - return -1; - } - if (height < 0) { - height = -height; - src_argb = src_argb + (height - 1) * src_stride_argb; - src_stride_argb = -src_stride_argb; - } - // Coalesce rows. 
- if (src_stride_argb == width * 4 && dst_stride_argb1555 == width * 2) { - width *= height; - height = 1; - src_stride_argb = dst_stride_argb1555 = 0; - } -#if defined(HAS_ARGBTOARGB1555ROW_SSE2) - if (TestCpuFlag(kCpuHasSSE2)) { - ARGBToARGB1555Row = ARGBToARGB1555Row_Any_SSE2; - if (IS_ALIGNED(width, 4)) { - ARGBToARGB1555Row = ARGBToARGB1555Row_SSE2; - } - } -#endif -#if defined(HAS_ARGBTOARGB1555ROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - ARGBToARGB1555Row = ARGBToARGB1555Row_Any_AVX2; - if (IS_ALIGNED(width, 8)) { - ARGBToARGB1555Row = ARGBToARGB1555Row_AVX2; - } - } -#endif -#if defined(HAS_ARGBTOARGB1555ROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - ARGBToARGB1555Row = ARGBToARGB1555Row_Any_NEON; - if (IS_ALIGNED(width, 8)) { - ARGBToARGB1555Row = ARGBToARGB1555Row_NEON; - } - } -#endif -#if defined(HAS_ARGBTOARGB1555ROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - ARGBToARGB1555Row = ARGBToARGB1555Row_Any_MMI; - if (IS_ALIGNED(width, 4)) { - ARGBToARGB1555Row = ARGBToARGB1555Row_MMI; - } - } -#endif -#if defined(HAS_ARGBTOARGB1555ROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - ARGBToARGB1555Row = ARGBToARGB1555Row_Any_MSA; - if (IS_ALIGNED(width, 8)) { - ARGBToARGB1555Row = ARGBToARGB1555Row_MSA; - } - } -#endif - - for (y = 0; y < height; ++y) { - ARGBToARGB1555Row(src_argb, dst_argb1555, width); - src_argb += src_stride_argb; - dst_argb1555 += dst_stride_argb1555; - } - return 0; -} - -// Convert ARGB To ARGB4444. -LIBYUV_API -int ARGBToARGB4444(const uint8_t* src_argb, - int src_stride_argb, - uint8_t* dst_argb4444, - int dst_stride_argb4444, - int width, - int height) { - int y; - void (*ARGBToARGB4444Row)(const uint8_t* src_argb, uint8_t* dst_rgb, - int width) = ARGBToARGB4444Row_C; - if (!src_argb || !dst_argb4444 || width <= 0 || height == 0) { - return -1; - } - if (height < 0) { - height = -height; - src_argb = src_argb + (height - 1) * src_stride_argb; - src_stride_argb = -src_stride_argb; - } - // Coalesce rows. - if (src_stride_argb == width * 4 && dst_stride_argb4444 == width * 2) { - width *= height; - height = 1; - src_stride_argb = dst_stride_argb4444 = 0; - } -#if defined(HAS_ARGBTOARGB4444ROW_SSE2) - if (TestCpuFlag(kCpuHasSSE2)) { - ARGBToARGB4444Row = ARGBToARGB4444Row_Any_SSE2; - if (IS_ALIGNED(width, 4)) { - ARGBToARGB4444Row = ARGBToARGB4444Row_SSE2; - } - } -#endif -#if defined(HAS_ARGBTOARGB4444ROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - ARGBToARGB4444Row = ARGBToARGB4444Row_Any_AVX2; - if (IS_ALIGNED(width, 8)) { - ARGBToARGB4444Row = ARGBToARGB4444Row_AVX2; - } - } -#endif -#if defined(HAS_ARGBTOARGB4444ROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - ARGBToARGB4444Row = ARGBToARGB4444Row_Any_NEON; - if (IS_ALIGNED(width, 8)) { - ARGBToARGB4444Row = ARGBToARGB4444Row_NEON; - } - } -#endif -#if defined(HAS_ARGBTOARGB4444ROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - ARGBToARGB4444Row = ARGBToARGB4444Row_Any_MMI; - if (IS_ALIGNED(width, 4)) { - ARGBToARGB4444Row = ARGBToARGB4444Row_MMI; - } - } -#endif -#if defined(HAS_ARGBTOARGB4444ROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - ARGBToARGB4444Row = ARGBToARGB4444Row_Any_MSA; - if (IS_ALIGNED(width, 8)) { - ARGBToARGB4444Row = ARGBToARGB4444Row_MSA; - } - } -#endif - - for (y = 0; y < height; ++y) { - ARGBToARGB4444Row(src_argb, dst_argb4444, width); - src_argb += src_stride_argb; - dst_argb4444 += dst_stride_argb4444; - } - return 0; -} - -// Convert ABGR To AR30. 
-LIBYUV_API -int ABGRToAR30(const uint8_t* src_abgr, - int src_stride_abgr, - uint8_t* dst_ar30, - int dst_stride_ar30, - int width, - int height) { - int y; - void (*ABGRToAR30Row)(const uint8_t* src_abgr, uint8_t* dst_rgb, int width) = - ABGRToAR30Row_C; - if (!src_abgr || !dst_ar30 || width <= 0 || height == 0) { - return -1; - } - if (height < 0) { - height = -height; - src_abgr = src_abgr + (height - 1) * src_stride_abgr; - src_stride_abgr = -src_stride_abgr; - } - // Coalesce rows. - if (src_stride_abgr == width * 4 && dst_stride_ar30 == width * 4) { - width *= height; - height = 1; - src_stride_abgr = dst_stride_ar30 = 0; - } -#if defined(HAS_ABGRTOAR30ROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - ABGRToAR30Row = ABGRToAR30Row_Any_SSSE3; - if (IS_ALIGNED(width, 4)) { - ABGRToAR30Row = ABGRToAR30Row_SSSE3; - } - } -#endif -#if defined(HAS_ABGRTOAR30ROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - ABGRToAR30Row = ABGRToAR30Row_Any_AVX2; - if (IS_ALIGNED(width, 8)) { - ABGRToAR30Row = ABGRToAR30Row_AVX2; - } - } -#endif - for (y = 0; y < height; ++y) { - ABGRToAR30Row(src_abgr, dst_ar30, width); - src_abgr += src_stride_abgr; - dst_ar30 += dst_stride_ar30; - } - return 0; -} - -// Convert ARGB To AR30. -LIBYUV_API -int ARGBToAR30(const uint8_t* src_argb, - int src_stride_argb, - uint8_t* dst_ar30, - int dst_stride_ar30, - int width, - int height) { - int y; - void (*ARGBToAR30Row)(const uint8_t* src_argb, uint8_t* dst_rgb, int width) = - ARGBToAR30Row_C; - if (!src_argb || !dst_ar30 || width <= 0 || height == 0) { - return -1; - } - if (height < 0) { - height = -height; - src_argb = src_argb + (height - 1) * src_stride_argb; - src_stride_argb = -src_stride_argb; - } - // Coalesce rows. - if (src_stride_argb == width * 4 && dst_stride_ar30 == width * 4) { - width *= height; - height = 1; - src_stride_argb = dst_stride_ar30 = 0; - } -#if defined(HAS_ARGBTOAR30ROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - ARGBToAR30Row = ARGBToAR30Row_Any_SSSE3; - if (IS_ALIGNED(width, 4)) { - ARGBToAR30Row = ARGBToAR30Row_SSSE3; - } - } -#endif -#if defined(HAS_ARGBTOAR30ROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - ARGBToAR30Row = ARGBToAR30Row_Any_AVX2; - if (IS_ALIGNED(width, 8)) { - ARGBToAR30Row = ARGBToAR30Row_AVX2; - } - } -#endif - for (y = 0; y < height; ++y) { - ARGBToAR30Row(src_argb, dst_ar30, width); - src_argb += src_stride_argb; - dst_ar30 += dst_stride_ar30; - } - return 0; -} - -// Convert ARGB to J420. (JPeg full range I420). -LIBYUV_API -int ARGBToJ420(const uint8_t* src_argb, - int src_stride_argb, - uint8_t* dst_yj, - int dst_stride_yj, - uint8_t* dst_u, - int dst_stride_u, - uint8_t* dst_v, - int dst_stride_v, - int width, - int height) { - int y; - void (*ARGBToUVJRow)(const uint8_t* src_argb0, int src_stride_argb, - uint8_t* dst_u, uint8_t* dst_v, int width) = - ARGBToUVJRow_C; - void (*ARGBToYJRow)(const uint8_t* src_argb, uint8_t* dst_yj, int width) = - ARGBToYJRow_C; - if (!src_argb || !dst_yj || !dst_u || !dst_v || width <= 0 || height == 0) { - return -1; - } - // Negative height means invert the image. 
- if (height < 0) { - height = -height; - src_argb = src_argb + (height - 1) * src_stride_argb; - src_stride_argb = -src_stride_argb; - } -#if defined(HAS_ARGBTOYJROW_SSSE3) && defined(HAS_ARGBTOUVJROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - ARGBToUVJRow = ARGBToUVJRow_Any_SSSE3; - ARGBToYJRow = ARGBToYJRow_Any_SSSE3; - if (IS_ALIGNED(width, 16)) { - ARGBToUVJRow = ARGBToUVJRow_SSSE3; - ARGBToYJRow = ARGBToYJRow_SSSE3; - } - } -#endif -#if defined(HAS_ARGBTOYJROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - ARGBToYJRow = ARGBToYJRow_Any_AVX2; - if (IS_ALIGNED(width, 32)) { - ARGBToYJRow = ARGBToYJRow_AVX2; - } - } -#endif -#if defined(HAS_ARGBTOYJROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - ARGBToYJRow = ARGBToYJRow_Any_NEON; - if (IS_ALIGNED(width, 8)) { - ARGBToYJRow = ARGBToYJRow_NEON; - } - } -#endif -#if defined(HAS_ARGBTOUVJROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - ARGBToUVJRow = ARGBToUVJRow_Any_NEON; - if (IS_ALIGNED(width, 16)) { - ARGBToUVJRow = ARGBToUVJRow_NEON; - } - } -#endif -#if defined(HAS_ARGBTOYJROW_MMI) && defined(HAS_ARGBTOUVJROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - ARGBToYJRow = ARGBToYJRow_Any_MMI; - ARGBToUVJRow = ARGBToUVJRow_Any_MMI; - if (IS_ALIGNED(width, 8)) { - ARGBToYJRow = ARGBToYJRow_MMI; - } - if (IS_ALIGNED(width, 16)) { - ARGBToUVJRow = ARGBToUVJRow_MMI; - } - } -#endif -#if defined(HAS_ARGBTOYJROW_MSA) && defined(HAS_ARGBTOUVJROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - ARGBToYJRow = ARGBToYJRow_Any_MSA; - ARGBToUVJRow = ARGBToUVJRow_Any_MSA; - if (IS_ALIGNED(width, 16)) { - ARGBToYJRow = ARGBToYJRow_MSA; - } - if (IS_ALIGNED(width, 32)) { - ARGBToUVJRow = ARGBToUVJRow_MSA; - } - } -#endif - - for (y = 0; y < height - 1; y += 2) { - ARGBToUVJRow(src_argb, src_stride_argb, dst_u, dst_v, width); - ARGBToYJRow(src_argb, dst_yj, width); - ARGBToYJRow(src_argb + src_stride_argb, dst_yj + dst_stride_yj, width); - src_argb += src_stride_argb * 2; - dst_yj += dst_stride_yj * 2; - dst_u += dst_stride_u; - dst_v += dst_stride_v; - } - if (height & 1) { - ARGBToUVJRow(src_argb, 0, dst_u, dst_v, width); - ARGBToYJRow(src_argb, dst_yj, width); - } - return 0; -} - -// Convert ARGB to J422. (JPeg full range I422). -LIBYUV_API -int ARGBToJ422(const uint8_t* src_argb, - int src_stride_argb, - uint8_t* dst_yj, - int dst_stride_yj, - uint8_t* dst_u, - int dst_stride_u, - uint8_t* dst_v, - int dst_stride_v, - int width, - int height) { - int y; - void (*ARGBToUVJRow)(const uint8_t* src_argb0, int src_stride_argb, - uint8_t* dst_u, uint8_t* dst_v, int width) = - ARGBToUVJRow_C; - void (*ARGBToYJRow)(const uint8_t* src_argb, uint8_t* dst_yj, int width) = - ARGBToYJRow_C; - if (!src_argb || !dst_yj || !dst_u || !dst_v || width <= 0 || height == 0) { - return -1; - } - // Negative height means invert the image. - if (height < 0) { - height = -height; - src_argb = src_argb + (height - 1) * src_stride_argb; - src_stride_argb = -src_stride_argb; - } - // Coalesce rows. 
- if (src_stride_argb == width * 4 && dst_stride_yj == width && - dst_stride_u * 2 == width && dst_stride_v * 2 == width) { - width *= height; - height = 1; - src_stride_argb = dst_stride_yj = dst_stride_u = dst_stride_v = 0; - } -#if defined(HAS_ARGBTOYJROW_SSSE3) && defined(HAS_ARGBTOUVJROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - ARGBToUVJRow = ARGBToUVJRow_Any_SSSE3; - ARGBToYJRow = ARGBToYJRow_Any_SSSE3; - if (IS_ALIGNED(width, 16)) { - ARGBToUVJRow = ARGBToUVJRow_SSSE3; - ARGBToYJRow = ARGBToYJRow_SSSE3; - } - } -#endif -#if defined(HAS_ARGBTOYJROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - ARGBToYJRow = ARGBToYJRow_Any_AVX2; - if (IS_ALIGNED(width, 32)) { - ARGBToYJRow = ARGBToYJRow_AVX2; - } - } -#endif -#if defined(HAS_ARGBTOYJROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - ARGBToYJRow = ARGBToYJRow_Any_NEON; - if (IS_ALIGNED(width, 8)) { - ARGBToYJRow = ARGBToYJRow_NEON; - } - } -#endif -#if defined(HAS_ARGBTOUVJROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - ARGBToUVJRow = ARGBToUVJRow_Any_NEON; - if (IS_ALIGNED(width, 16)) { - ARGBToUVJRow = ARGBToUVJRow_NEON; - } - } -#endif -#if defined(HAS_ARGBTOYJROW_MMI) && defined(HAS_ARGBTOUVJROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - ARGBToYJRow = ARGBToYJRow_Any_MMI; - ARGBToUVJRow = ARGBToUVJRow_Any_MMI; - if (IS_ALIGNED(width, 8)) { - ARGBToYJRow = ARGBToYJRow_MMI; - } - if (IS_ALIGNED(width, 16)) { - ARGBToUVJRow = ARGBToUVJRow_MMI; - } - } -#endif -#if defined(HAS_ARGBTOYJROW_MSA) && defined(HAS_ARGBTOUVJROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - ARGBToYJRow = ARGBToYJRow_Any_MSA; - ARGBToUVJRow = ARGBToUVJRow_Any_MSA; - if (IS_ALIGNED(width, 16)) { - ARGBToYJRow = ARGBToYJRow_MSA; - } - if (IS_ALIGNED(width, 32)) { - ARGBToUVJRow = ARGBToUVJRow_MSA; - } - } -#endif - - for (y = 0; y < height; ++y) { - ARGBToUVJRow(src_argb, 0, dst_u, dst_v, width); - ARGBToYJRow(src_argb, dst_yj, width); - src_argb += src_stride_argb; - dst_yj += dst_stride_yj; - dst_u += dst_stride_u; - dst_v += dst_stride_v; - } - return 0; -} - -// Convert ARGB to AR64. -LIBYUV_API -int ARGBToAR64(const uint8_t* src_argb, - int src_stride_argb, - uint16_t* dst_ar64, - int dst_stride_ar64, - int width, - int height) { - int y; - void (*ARGBToAR64Row)(const uint8_t* src_argb, uint16_t* dst_ar64, - int width) = ARGBToAR64Row_C; - if (!src_argb || !dst_ar64 || width <= 0 || height == 0) { - return -1; - } - // Negative height means invert the image. - if (height < 0) { - height = -height; - src_argb = src_argb + (height - 1) * src_stride_argb; - src_stride_argb = -src_stride_argb; - } - // Coalesce rows. - if (src_stride_argb == width * 4 && dst_stride_ar64 == width * 4) { - width *= height; - height = 1; - src_stride_argb = dst_stride_ar64 = 0; - } -#if defined(HAS_ARGBTOAR64ROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - ARGBToAR64Row = ARGBToAR64Row_Any_SSSE3; - if (IS_ALIGNED(width, 4)) { - ARGBToAR64Row = ARGBToAR64Row_SSSE3; - } - } -#endif -#if defined(HAS_ARGBTOAR64ROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - ARGBToAR64Row = ARGBToAR64Row_Any_AVX2; - if (IS_ALIGNED(width, 8)) { - ARGBToAR64Row = ARGBToAR64Row_AVX2; - } - } -#endif -#if defined(HAS_ARGBTOAR64ROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - ARGBToAR64Row = ARGBToAR64Row_Any_NEON; - if (IS_ALIGNED(width, 8)) { - ARGBToAR64Row = ARGBToAR64Row_NEON; - } - } -#endif - - for (y = 0; y < height; ++y) { - ARGBToAR64Row(src_argb, dst_ar64, width); - src_argb += src_stride_argb; - dst_ar64 += dst_stride_ar64; - } - return 0; -} - -// Convert ARGB to AB64. 
-LIBYUV_API -int ARGBToAB64(const uint8_t* src_argb, - int src_stride_argb, - uint16_t* dst_ab64, - int dst_stride_ab64, - int width, - int height) { - int y; - void (*ARGBToAB64Row)(const uint8_t* src_argb, uint16_t* dst_ar64, - int width) = ARGBToAB64Row_C; - if (!src_argb || !dst_ab64 || width <= 0 || height == 0) { - return -1; - } - // Negative height means invert the image. - if (height < 0) { - height = -height; - src_argb = src_argb + (height - 1) * src_stride_argb; - src_stride_argb = -src_stride_argb; - } - // Coalesce rows. - if (src_stride_argb == width * 4 && dst_stride_ab64 == width * 4) { - width *= height; - height = 1; - src_stride_argb = dst_stride_ab64 = 0; - } -#if defined(HAS_ARGBTOAB64ROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - ARGBToAB64Row = ARGBToAB64Row_Any_SSSE3; - if (IS_ALIGNED(width, 4)) { - ARGBToAB64Row = ARGBToAB64Row_SSSE3; - } - } -#endif -#if defined(HAS_ARGBTOAB64ROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - ARGBToAB64Row = ARGBToAB64Row_Any_AVX2; - if (IS_ALIGNED(width, 8)) { - ARGBToAB64Row = ARGBToAB64Row_AVX2; - } - } -#endif -#if defined(HAS_ARGBTOAB64ROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - ARGBToAB64Row = ARGBToAB64Row_Any_NEON; - if (IS_ALIGNED(width, 8)) { - ARGBToAB64Row = ARGBToAB64Row_NEON; - } - } -#endif - - for (y = 0; y < height; ++y) { - ARGBToAB64Row(src_argb, dst_ab64, width); - src_argb += src_stride_argb; - dst_ab64 += dst_stride_ab64; - } - return 0; -} - -// Convert ARGB to J400. -LIBYUV_API -int ARGBToJ400(const uint8_t* src_argb, - int src_stride_argb, - uint8_t* dst_yj, - int dst_stride_yj, - int width, - int height) { - int y; - void (*ARGBToYJRow)(const uint8_t* src_argb, uint8_t* dst_yj, int width) = - ARGBToYJRow_C; - if (!src_argb || !dst_yj || width <= 0 || height == 0) { - return -1; - } - if (height < 0) { - height = -height; - src_argb = src_argb + (height - 1) * src_stride_argb; - src_stride_argb = -src_stride_argb; - } - // Coalesce rows. - if (src_stride_argb == width * 4 && dst_stride_yj == width) { - width *= height; - height = 1; - src_stride_argb = dst_stride_yj = 0; - } -#if defined(HAS_ARGBTOYJROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - ARGBToYJRow = ARGBToYJRow_Any_SSSE3; - if (IS_ALIGNED(width, 16)) { - ARGBToYJRow = ARGBToYJRow_SSSE3; - } - } -#endif -#if defined(HAS_ARGBTOYJROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - ARGBToYJRow = ARGBToYJRow_Any_AVX2; - if (IS_ALIGNED(width, 32)) { - ARGBToYJRow = ARGBToYJRow_AVX2; - } - } -#endif -#if defined(HAS_ARGBTOYJROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - ARGBToYJRow = ARGBToYJRow_Any_NEON; - if (IS_ALIGNED(width, 8)) { - ARGBToYJRow = ARGBToYJRow_NEON; - } - } -#endif -#if defined(HAS_ARGBTOYJROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - ARGBToYJRow = ARGBToYJRow_Any_MMI; - if (IS_ALIGNED(width, 8)) { - ARGBToYJRow = ARGBToYJRow_MMI; - } - } -#endif -#if defined(HAS_ARGBTOYJROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - ARGBToYJRow = ARGBToYJRow_Any_MSA; - if (IS_ALIGNED(width, 16)) { - ARGBToYJRow = ARGBToYJRow_MSA; - } - } -#endif - - for (y = 0; y < height; ++y) { - ARGBToYJRow(src_argb, dst_yj, width); - src_argb += src_stride_argb; - dst_yj += dst_stride_yj; - } - return 0; -} - -// Convert RGBA to J400. 
-LIBYUV_API -int RGBAToJ400(const uint8_t* src_rgba, - int src_stride_rgba, - uint8_t* dst_yj, - int dst_stride_yj, - int width, - int height) { - int y; - void (*RGBAToYJRow)(const uint8_t* src_rgba, uint8_t* dst_yj, int width) = - RGBAToYJRow_C; - if (!src_rgba || !dst_yj || width <= 0 || height == 0) { - return -1; - } - if (height < 0) { - height = -height; - src_rgba = src_rgba + (height - 1) * src_stride_rgba; - src_stride_rgba = -src_stride_rgba; - } - // Coalesce rows. - if (src_stride_rgba == width * 4 && dst_stride_yj == width) { - width *= height; - height = 1; - src_stride_rgba = dst_stride_yj = 0; - } -#if defined(HAS_RGBATOYJROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - RGBAToYJRow = RGBAToYJRow_Any_SSSE3; - if (IS_ALIGNED(width, 16)) { - RGBAToYJRow = RGBAToYJRow_SSSE3; - } - } -#endif -#if defined(HAS_RGBATOYJROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - RGBAToYJRow = RGBAToYJRow_Any_AVX2; - if (IS_ALIGNED(width, 32)) { - RGBAToYJRow = RGBAToYJRow_AVX2; - } - } -#endif -#if defined(HAS_RGBATOYJROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - RGBAToYJRow = RGBAToYJRow_Any_NEON; - if (IS_ALIGNED(width, 8)) { - RGBAToYJRow = RGBAToYJRow_NEON; - } - } -#endif -#if defined(HAS_RGBATOYJROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - RGBAToYJRow = RGBAToYJRow_Any_MMI; - if (IS_ALIGNED(width, 8)) { - RGBAToYJRow = RGBAToYJRow_MMI; - } - } -#endif -#if defined(HAS_RGBATOYJROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - RGBAToYJRow = RGBAToYJRow_Any_MSA; - if (IS_ALIGNED(width, 16)) { - RGBAToYJRow = RGBAToYJRow_MSA; - } - } -#endif - - for (y = 0; y < height; ++y) { - RGBAToYJRow(src_rgba, dst_yj, width); - src_rgba += src_stride_rgba; - dst_yj += dst_stride_yj; - } - return 0; -} - -#ifdef __cplusplus -} // extern "C" -} // namespace libyuv -#endif diff --git a/thirdparty/libyuv/source/convert_jpeg.cc b/thirdparty/libyuv/source/convert_jpeg.cc deleted file mode 100644 index d7556ee..0000000 --- a/thirdparty/libyuv/source/convert_jpeg.cc +++ /dev/null @@ -1,602 +0,0 @@ -/* - * Copyright 2011 The LibYuv Project Authors. All rights reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. 
- */ - -#include "libyuv/convert.h" -#include "libyuv/convert_argb.h" - -#ifdef HAVE_JPEG -#include "libyuv/mjpeg_decoder.h" -#endif - -#ifdef __cplusplus -namespace libyuv { -extern "C" { -#endif - -#ifdef HAVE_JPEG -struct I420Buffers { - uint8_t* y; - int y_stride; - uint8_t* u; - int u_stride; - uint8_t* v; - int v_stride; - int w; - int h; -}; - -static void JpegCopyI420(void* opaque, - const uint8_t* const* data, - const int* strides, - int rows) { - I420Buffers* dest = (I420Buffers*)(opaque); - I420Copy(data[0], strides[0], data[1], strides[1], data[2], strides[2], - dest->y, dest->y_stride, dest->u, dest->u_stride, dest->v, - dest->v_stride, dest->w, rows); - dest->y += rows * dest->y_stride; - dest->u += ((rows + 1) >> 1) * dest->u_stride; - dest->v += ((rows + 1) >> 1) * dest->v_stride; - dest->h -= rows; -} - -static void JpegI422ToI420(void* opaque, - const uint8_t* const* data, - const int* strides, - int rows) { - I420Buffers* dest = (I420Buffers*)(opaque); - I422ToI420(data[0], strides[0], data[1], strides[1], data[2], strides[2], - dest->y, dest->y_stride, dest->u, dest->u_stride, dest->v, - dest->v_stride, dest->w, rows); - dest->y += rows * dest->y_stride; - dest->u += ((rows + 1) >> 1) * dest->u_stride; - dest->v += ((rows + 1) >> 1) * dest->v_stride; - dest->h -= rows; -} - -static void JpegI444ToI420(void* opaque, - const uint8_t* const* data, - const int* strides, - int rows) { - I420Buffers* dest = (I420Buffers*)(opaque); - I444ToI420(data[0], strides[0], data[1], strides[1], data[2], strides[2], - dest->y, dest->y_stride, dest->u, dest->u_stride, dest->v, - dest->v_stride, dest->w, rows); - dest->y += rows * dest->y_stride; - dest->u += ((rows + 1) >> 1) * dest->u_stride; - dest->v += ((rows + 1) >> 1) * dest->v_stride; - dest->h -= rows; -} - -static void JpegI400ToI420(void* opaque, - const uint8_t* const* data, - const int* strides, - int rows) { - I420Buffers* dest = (I420Buffers*)(opaque); - I400ToI420(data[0], strides[0], dest->y, dest->y_stride, dest->u, - dest->u_stride, dest->v, dest->v_stride, dest->w, rows); - dest->y += rows * dest->y_stride; - dest->u += ((rows + 1) >> 1) * dest->u_stride; - dest->v += ((rows + 1) >> 1) * dest->v_stride; - dest->h -= rows; -} - -// Query size of MJPG in pixels. -LIBYUV_API -int MJPGSize(const uint8_t* src_mjpg, - size_t src_size_mjpg, - int* width, - int* height) { - MJpegDecoder mjpeg_decoder; - LIBYUV_BOOL ret = mjpeg_decoder.LoadFrame(src_mjpg, src_size_mjpg); - if (ret) { - *width = mjpeg_decoder.GetWidth(); - *height = mjpeg_decoder.GetHeight(); - } - mjpeg_decoder.UnloadFrame(); - return ret ? 0 : -1; // -1 for runtime failure. -} - -// MJPG (Motion JPeg) to I420 -// TODO(fbarchard): review src_width and src_height requirement. dst_width and -// dst_height may be enough. -LIBYUV_API -int MJPGToI420(const uint8_t* src_mjpg, - size_t src_size_mjpg, - uint8_t* dst_y, - int dst_stride_y, - uint8_t* dst_u, - int dst_stride_u, - uint8_t* dst_v, - int dst_stride_v, - int src_width, - int src_height, - int dst_width, - int dst_height) { - if (src_size_mjpg == kUnknownDataSize) { - // ERROR: MJPEG frame size unknown - return -1; - } - - // TODO(fbarchard): Port MJpeg to C. 
- MJpegDecoder mjpeg_decoder; - LIBYUV_BOOL ret = mjpeg_decoder.LoadFrame(src_mjpg, src_size_mjpg); - if (ret && (mjpeg_decoder.GetWidth() != src_width || - mjpeg_decoder.GetHeight() != src_height)) { - // ERROR: MJPEG frame has unexpected dimensions - mjpeg_decoder.UnloadFrame(); - return 1; // runtime failure - } - if (ret) { - I420Buffers bufs = {dst_y, dst_stride_y, dst_u, dst_stride_u, - dst_v, dst_stride_v, dst_width, dst_height}; - // YUV420 - if (mjpeg_decoder.GetColorSpace() == MJpegDecoder::kColorSpaceYCbCr && - mjpeg_decoder.GetNumComponents() == 3 && - mjpeg_decoder.GetVertSampFactor(0) == 2 && - mjpeg_decoder.GetHorizSampFactor(0) == 2 && - mjpeg_decoder.GetVertSampFactor(1) == 1 && - mjpeg_decoder.GetHorizSampFactor(1) == 1 && - mjpeg_decoder.GetVertSampFactor(2) == 1 && - mjpeg_decoder.GetHorizSampFactor(2) == 1) { - ret = mjpeg_decoder.DecodeToCallback(&JpegCopyI420, &bufs, dst_width, - dst_height); - // YUV422 - } else if (mjpeg_decoder.GetColorSpace() == - MJpegDecoder::kColorSpaceYCbCr && - mjpeg_decoder.GetNumComponents() == 3 && - mjpeg_decoder.GetVertSampFactor(0) == 1 && - mjpeg_decoder.GetHorizSampFactor(0) == 2 && - mjpeg_decoder.GetVertSampFactor(1) == 1 && - mjpeg_decoder.GetHorizSampFactor(1) == 1 && - mjpeg_decoder.GetVertSampFactor(2) == 1 && - mjpeg_decoder.GetHorizSampFactor(2) == 1) { - ret = mjpeg_decoder.DecodeToCallback(&JpegI422ToI420, &bufs, dst_width, - dst_height); - // YUV444 - } else if (mjpeg_decoder.GetColorSpace() == - MJpegDecoder::kColorSpaceYCbCr && - mjpeg_decoder.GetNumComponents() == 3 && - mjpeg_decoder.GetVertSampFactor(0) == 1 && - mjpeg_decoder.GetHorizSampFactor(0) == 1 && - mjpeg_decoder.GetVertSampFactor(1) == 1 && - mjpeg_decoder.GetHorizSampFactor(1) == 1 && - mjpeg_decoder.GetVertSampFactor(2) == 1 && - mjpeg_decoder.GetHorizSampFactor(2) == 1) { - ret = mjpeg_decoder.DecodeToCallback(&JpegI444ToI420, &bufs, dst_width, - dst_height); - // YUV400 - } else if (mjpeg_decoder.GetColorSpace() == - MJpegDecoder::kColorSpaceGrayscale && - mjpeg_decoder.GetNumComponents() == 1 && - mjpeg_decoder.GetVertSampFactor(0) == 1 && - mjpeg_decoder.GetHorizSampFactor(0) == 1) { - ret = mjpeg_decoder.DecodeToCallback(&JpegI400ToI420, &bufs, dst_width, - dst_height); - } else { - // TODO(fbarchard): Implement conversion for any other - // colorspace/subsample factors that occur in practice. ERROR: Unable to - // convert MJPEG frame because format is not supported - mjpeg_decoder.UnloadFrame(); - return 1; - } - } - return ret ? 
0 : 1; -} - -struct NV21Buffers { - uint8_t* y; - int y_stride; - uint8_t* vu; - int vu_stride; - int w; - int h; -}; - -static void JpegI420ToNV21(void* opaque, - const uint8_t* const* data, - const int* strides, - int rows) { - NV21Buffers* dest = (NV21Buffers*)(opaque); - I420ToNV21(data[0], strides[0], data[1], strides[1], data[2], strides[2], - dest->y, dest->y_stride, dest->vu, dest->vu_stride, dest->w, rows); - dest->y += rows * dest->y_stride; - dest->vu += ((rows + 1) >> 1) * dest->vu_stride; - dest->h -= rows; -} - -static void JpegI422ToNV21(void* opaque, - const uint8_t* const* data, - const int* strides, - int rows) { - NV21Buffers* dest = (NV21Buffers*)(opaque); - I422ToNV21(data[0], strides[0], data[1], strides[1], data[2], strides[2], - dest->y, dest->y_stride, dest->vu, dest->vu_stride, dest->w, rows); - dest->y += rows * dest->y_stride; - dest->vu += ((rows + 1) >> 1) * dest->vu_stride; - dest->h -= rows; -} - -static void JpegI444ToNV21(void* opaque, - const uint8_t* const* data, - const int* strides, - int rows) { - NV21Buffers* dest = (NV21Buffers*)(opaque); - I444ToNV21(data[0], strides[0], data[1], strides[1], data[2], strides[2], - dest->y, dest->y_stride, dest->vu, dest->vu_stride, dest->w, rows); - dest->y += rows * dest->y_stride; - dest->vu += ((rows + 1) >> 1) * dest->vu_stride; - dest->h -= rows; -} - -static void JpegI400ToNV21(void* opaque, - const uint8_t* const* data, - const int* strides, - int rows) { - NV21Buffers* dest = (NV21Buffers*)(opaque); - I400ToNV21(data[0], strides[0], dest->y, dest->y_stride, dest->vu, - dest->vu_stride, dest->w, rows); - dest->y += rows * dest->y_stride; - dest->vu += ((rows + 1) >> 1) * dest->vu_stride; - dest->h -= rows; -} - -// MJPG (Motion JPeg) to NV21 -LIBYUV_API -int MJPGToNV21(const uint8_t* src_mjpg, - size_t src_size_mjpg, - uint8_t* dst_y, - int dst_stride_y, - uint8_t* dst_vu, - int dst_stride_vu, - int src_width, - int src_height, - int dst_width, - int dst_height) { - if (src_size_mjpg == kUnknownDataSize) { - // ERROR: MJPEG frame size unknown - return -1; - } - - // TODO(fbarchard): Port MJpeg to C. 
- MJpegDecoder mjpeg_decoder; - LIBYUV_BOOL ret = mjpeg_decoder.LoadFrame(src_mjpg, src_size_mjpg); - if (ret && (mjpeg_decoder.GetWidth() != src_width || - mjpeg_decoder.GetHeight() != src_height)) { - // ERROR: MJPEG frame has unexpected dimensions - mjpeg_decoder.UnloadFrame(); - return 1; // runtime failure - } - if (ret) { - NV21Buffers bufs = {dst_y, dst_stride_y, dst_vu, - dst_stride_vu, dst_width, dst_height}; - // YUV420 - if (mjpeg_decoder.GetColorSpace() == MJpegDecoder::kColorSpaceYCbCr && - mjpeg_decoder.GetNumComponents() == 3 && - mjpeg_decoder.GetVertSampFactor(0) == 2 && - mjpeg_decoder.GetHorizSampFactor(0) == 2 && - mjpeg_decoder.GetVertSampFactor(1) == 1 && - mjpeg_decoder.GetHorizSampFactor(1) == 1 && - mjpeg_decoder.GetVertSampFactor(2) == 1 && - mjpeg_decoder.GetHorizSampFactor(2) == 1) { - ret = mjpeg_decoder.DecodeToCallback(&JpegI420ToNV21, &bufs, dst_width, - dst_height); - // YUV422 - } else if (mjpeg_decoder.GetColorSpace() == - MJpegDecoder::kColorSpaceYCbCr && - mjpeg_decoder.GetNumComponents() == 3 && - mjpeg_decoder.GetVertSampFactor(0) == 1 && - mjpeg_decoder.GetHorizSampFactor(0) == 2 && - mjpeg_decoder.GetVertSampFactor(1) == 1 && - mjpeg_decoder.GetHorizSampFactor(1) == 1 && - mjpeg_decoder.GetVertSampFactor(2) == 1 && - mjpeg_decoder.GetHorizSampFactor(2) == 1) { - ret = mjpeg_decoder.DecodeToCallback(&JpegI422ToNV21, &bufs, dst_width, - dst_height); - // YUV444 - } else if (mjpeg_decoder.GetColorSpace() == - MJpegDecoder::kColorSpaceYCbCr && - mjpeg_decoder.GetNumComponents() == 3 && - mjpeg_decoder.GetVertSampFactor(0) == 1 && - mjpeg_decoder.GetHorizSampFactor(0) == 1 && - mjpeg_decoder.GetVertSampFactor(1) == 1 && - mjpeg_decoder.GetHorizSampFactor(1) == 1 && - mjpeg_decoder.GetVertSampFactor(2) == 1 && - mjpeg_decoder.GetHorizSampFactor(2) == 1) { - ret = mjpeg_decoder.DecodeToCallback(&JpegI444ToNV21, &bufs, dst_width, - dst_height); - // YUV400 - } else if (mjpeg_decoder.GetColorSpace() == - MJpegDecoder::kColorSpaceGrayscale && - mjpeg_decoder.GetNumComponents() == 1 && - mjpeg_decoder.GetVertSampFactor(0) == 1 && - mjpeg_decoder.GetHorizSampFactor(0) == 1) { - ret = mjpeg_decoder.DecodeToCallback(&JpegI400ToNV21, &bufs, dst_width, - dst_height); - } else { - // Unknown colorspace. - mjpeg_decoder.UnloadFrame(); - return 1; - } - } - return ret ? 0 : 1; -} - -static void JpegI420ToNV12(void* opaque, - const uint8_t* const* data, - const int* strides, - int rows) { - NV21Buffers* dest = (NV21Buffers*)(opaque); - // Use NV21 with VU swapped. - I420ToNV21(data[0], strides[0], data[2], strides[2], data[1], strides[1], - dest->y, dest->y_stride, dest->vu, dest->vu_stride, dest->w, rows); - dest->y += rows * dest->y_stride; - dest->vu += ((rows + 1) >> 1) * dest->vu_stride; - dest->h -= rows; -} - -static void JpegI422ToNV12(void* opaque, - const uint8_t* const* data, - const int* strides, - int rows) { - NV21Buffers* dest = (NV21Buffers*)(opaque); - // Use NV21 with VU swapped. - I422ToNV21(data[0], strides[0], data[2], strides[2], data[1], strides[1], - dest->y, dest->y_stride, dest->vu, dest->vu_stride, dest->w, rows); - dest->y += rows * dest->y_stride; - dest->vu += ((rows + 1) >> 1) * dest->vu_stride; - dest->h -= rows; -} - -static void JpegI444ToNV12(void* opaque, - const uint8_t* const* data, - const int* strides, - int rows) { - NV21Buffers* dest = (NV21Buffers*)(opaque); - // Use NV21 with VU swapped. 
- I444ToNV21(data[0], strides[0], data[2], strides[2], data[1], strides[1], - dest->y, dest->y_stride, dest->vu, dest->vu_stride, dest->w, rows); - dest->y += rows * dest->y_stride; - dest->vu += ((rows + 1) >> 1) * dest->vu_stride; - dest->h -= rows; -} - -static void JpegI400ToNV12(void* opaque, - const uint8_t* const* data, - const int* strides, - int rows) { - NV21Buffers* dest = (NV21Buffers*)(opaque); - // Use NV21 since there is no UV plane. - I400ToNV21(data[0], strides[0], dest->y, dest->y_stride, dest->vu, - dest->vu_stride, dest->w, rows); - dest->y += rows * dest->y_stride; - dest->vu += ((rows + 1) >> 1) * dest->vu_stride; - dest->h -= rows; -} - -// MJPG (Motion JPEG) to NV12. -LIBYUV_API -int MJPGToNV12(const uint8_t* sample, - size_t sample_size, - uint8_t* dst_y, - int dst_stride_y, - uint8_t* dst_uv, - int dst_stride_uv, - int src_width, - int src_height, - int dst_width, - int dst_height) { - if (sample_size == kUnknownDataSize) { - // ERROR: MJPEG frame size unknown - return -1; - } - - // TODO(fbarchard): Port MJpeg to C. - MJpegDecoder mjpeg_decoder; - LIBYUV_BOOL ret = mjpeg_decoder.LoadFrame(sample, sample_size); - if (ret && (mjpeg_decoder.GetWidth() != src_width || - mjpeg_decoder.GetHeight() != src_height)) { - // ERROR: MJPEG frame has unexpected dimensions - mjpeg_decoder.UnloadFrame(); - return 1; // runtime failure - } - if (ret) { - // Use NV21Buffers but with UV instead of VU. - NV21Buffers bufs = {dst_y, dst_stride_y, dst_uv, - dst_stride_uv, dst_width, dst_height}; - // YUV420 - if (mjpeg_decoder.GetColorSpace() == MJpegDecoder::kColorSpaceYCbCr && - mjpeg_decoder.GetNumComponents() == 3 && - mjpeg_decoder.GetVertSampFactor(0) == 2 && - mjpeg_decoder.GetHorizSampFactor(0) == 2 && - mjpeg_decoder.GetVertSampFactor(1) == 1 && - mjpeg_decoder.GetHorizSampFactor(1) == 1 && - mjpeg_decoder.GetVertSampFactor(2) == 1 && - mjpeg_decoder.GetHorizSampFactor(2) == 1) { - ret = mjpeg_decoder.DecodeToCallback(&JpegI420ToNV12, &bufs, dst_width, - dst_height); - // YUV422 - } else if (mjpeg_decoder.GetColorSpace() == - MJpegDecoder::kColorSpaceYCbCr && - mjpeg_decoder.GetNumComponents() == 3 && - mjpeg_decoder.GetVertSampFactor(0) == 1 && - mjpeg_decoder.GetHorizSampFactor(0) == 2 && - mjpeg_decoder.GetVertSampFactor(1) == 1 && - mjpeg_decoder.GetHorizSampFactor(1) == 1 && - mjpeg_decoder.GetVertSampFactor(2) == 1 && - mjpeg_decoder.GetHorizSampFactor(2) == 1) { - ret = mjpeg_decoder.DecodeToCallback(&JpegI422ToNV12, &bufs, dst_width, - dst_height); - // YUV444 - } else if (mjpeg_decoder.GetColorSpace() == - MJpegDecoder::kColorSpaceYCbCr && - mjpeg_decoder.GetNumComponents() == 3 && - mjpeg_decoder.GetVertSampFactor(0) == 1 && - mjpeg_decoder.GetHorizSampFactor(0) == 1 && - mjpeg_decoder.GetVertSampFactor(1) == 1 && - mjpeg_decoder.GetHorizSampFactor(1) == 1 && - mjpeg_decoder.GetVertSampFactor(2) == 1 && - mjpeg_decoder.GetHorizSampFactor(2) == 1) { - ret = mjpeg_decoder.DecodeToCallback(&JpegI444ToNV12, &bufs, dst_width, - dst_height); - // YUV400 - } else if (mjpeg_decoder.GetColorSpace() == - MJpegDecoder::kColorSpaceGrayscale && - mjpeg_decoder.GetNumComponents() == 1 && - mjpeg_decoder.GetVertSampFactor(0) == 1 && - mjpeg_decoder.GetHorizSampFactor(0) == 1) { - ret = mjpeg_decoder.DecodeToCallback(&JpegI400ToNV12, &bufs, dst_width, - dst_height); - } else { - // Unknown colorspace. - mjpeg_decoder.UnloadFrame(); - return 1; - } - } - return ret ? 
0 : 1; -} - -struct ARGBBuffers { - uint8_t* argb; - int argb_stride; - int w; - int h; -}; - -static void JpegI420ToARGB(void* opaque, - const uint8_t* const* data, - const int* strides, - int rows) { - ARGBBuffers* dest = (ARGBBuffers*)(opaque); - I420ToARGB(data[0], strides[0], data[1], strides[1], data[2], strides[2], - dest->argb, dest->argb_stride, dest->w, rows); - dest->argb += rows * dest->argb_stride; - dest->h -= rows; -} - -static void JpegI422ToARGB(void* opaque, - const uint8_t* const* data, - const int* strides, - int rows) { - ARGBBuffers* dest = (ARGBBuffers*)(opaque); - I422ToARGB(data[0], strides[0], data[1], strides[1], data[2], strides[2], - dest->argb, dest->argb_stride, dest->w, rows); - dest->argb += rows * dest->argb_stride; - dest->h -= rows; -} - -static void JpegI444ToARGB(void* opaque, - const uint8_t* const* data, - const int* strides, - int rows) { - ARGBBuffers* dest = (ARGBBuffers*)(opaque); - I444ToARGB(data[0], strides[0], data[1], strides[1], data[2], strides[2], - dest->argb, dest->argb_stride, dest->w, rows); - dest->argb += rows * dest->argb_stride; - dest->h -= rows; -} - -static void JpegI400ToARGB(void* opaque, - const uint8_t* const* data, - const int* strides, - int rows) { - ARGBBuffers* dest = (ARGBBuffers*)(opaque); - I400ToARGB(data[0], strides[0], dest->argb, dest->argb_stride, dest->w, rows); - dest->argb += rows * dest->argb_stride; - dest->h -= rows; -} - -// MJPG (Motion JPeg) to ARGB -// TODO(fbarchard): review src_width and src_height requirement. dst_width and -// dst_height may be enough. -LIBYUV_API -int MJPGToARGB(const uint8_t* src_mjpg, - size_t src_size_mjpg, - uint8_t* dst_argb, - int dst_stride_argb, - int src_width, - int src_height, - int dst_width, - int dst_height) { - if (src_size_mjpg == kUnknownDataSize) { - // ERROR: MJPEG frame size unknown - return -1; - } - - // TODO(fbarchard): Port MJpeg to C. 
- MJpegDecoder mjpeg_decoder; - LIBYUV_BOOL ret = mjpeg_decoder.LoadFrame(src_mjpg, src_size_mjpg); - if (ret && (mjpeg_decoder.GetWidth() != src_width || - mjpeg_decoder.GetHeight() != src_height)) { - // ERROR: MJPEG frame has unexpected dimensions - mjpeg_decoder.UnloadFrame(); - return 1; // runtime failure - } - if (ret) { - ARGBBuffers bufs = {dst_argb, dst_stride_argb, dst_width, dst_height}; - // YUV420 - if (mjpeg_decoder.GetColorSpace() == MJpegDecoder::kColorSpaceYCbCr && - mjpeg_decoder.GetNumComponents() == 3 && - mjpeg_decoder.GetVertSampFactor(0) == 2 && - mjpeg_decoder.GetHorizSampFactor(0) == 2 && - mjpeg_decoder.GetVertSampFactor(1) == 1 && - mjpeg_decoder.GetHorizSampFactor(1) == 1 && - mjpeg_decoder.GetVertSampFactor(2) == 1 && - mjpeg_decoder.GetHorizSampFactor(2) == 1) { - ret = mjpeg_decoder.DecodeToCallback(&JpegI420ToARGB, &bufs, dst_width, - dst_height); - // YUV422 - } else if (mjpeg_decoder.GetColorSpace() == - MJpegDecoder::kColorSpaceYCbCr && - mjpeg_decoder.GetNumComponents() == 3 && - mjpeg_decoder.GetVertSampFactor(0) == 1 && - mjpeg_decoder.GetHorizSampFactor(0) == 2 && - mjpeg_decoder.GetVertSampFactor(1) == 1 && - mjpeg_decoder.GetHorizSampFactor(1) == 1 && - mjpeg_decoder.GetVertSampFactor(2) == 1 && - mjpeg_decoder.GetHorizSampFactor(2) == 1) { - ret = mjpeg_decoder.DecodeToCallback(&JpegI422ToARGB, &bufs, dst_width, - dst_height); - // YUV444 - } else if (mjpeg_decoder.GetColorSpace() == - MJpegDecoder::kColorSpaceYCbCr && - mjpeg_decoder.GetNumComponents() == 3 && - mjpeg_decoder.GetVertSampFactor(0) == 1 && - mjpeg_decoder.GetHorizSampFactor(0) == 1 && - mjpeg_decoder.GetVertSampFactor(1) == 1 && - mjpeg_decoder.GetHorizSampFactor(1) == 1 && - mjpeg_decoder.GetVertSampFactor(2) == 1 && - mjpeg_decoder.GetHorizSampFactor(2) == 1) { - ret = mjpeg_decoder.DecodeToCallback(&JpegI444ToARGB, &bufs, dst_width, - dst_height); - // YUV400 - } else if (mjpeg_decoder.GetColorSpace() == - MJpegDecoder::kColorSpaceGrayscale && - mjpeg_decoder.GetNumComponents() == 1 && - mjpeg_decoder.GetVertSampFactor(0) == 1 && - mjpeg_decoder.GetHorizSampFactor(0) == 1) { - ret = mjpeg_decoder.DecodeToCallback(&JpegI400ToARGB, &bufs, dst_width, - dst_height); - } else { - // TODO(fbarchard): Implement conversion for any other - // colorspace/subsample factors that occur in practice. ERROR: Unable to - // convert MJPEG frame because format is not supported - mjpeg_decoder.UnloadFrame(); - return 1; - } - } - return ret ? 0 : 1; -} - -#endif // HAVE_JPEG - -#ifdef __cplusplus -} // extern "C" -} // namespace libyuv -#endif diff --git a/thirdparty/libyuv/source/convert_to_argb.cc b/thirdparty/libyuv/source/convert_to_argb.cc deleted file mode 100644 index 84df16c..0000000 --- a/thirdparty/libyuv/source/convert_to_argb.cc +++ /dev/null @@ -1,382 +0,0 @@ -/* - * Copyright 2011 The LibYuv Project Authors. All rights reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. 
- */ - -#include "libyuv/convert_argb.h" - -#include "libyuv/cpu_id.h" -#ifdef HAVE_JPEG -#include "libyuv/mjpeg_decoder.h" -#endif -#include "libyuv/rotate_argb.h" -#include "libyuv/row.h" -#include "libyuv/video_common.h" - -#ifdef __cplusplus -namespace libyuv { -extern "C" { -#endif - -// Convert camera sample to ARGB with cropping, rotation and vertical flip. -// src_width is used for source stride computation -// src_height is used to compute location of planes, and indicate inversion -// sample_size is measured in bytes and is the size of the frame. -// With MJPEG it is the compressed size of the frame. - -// TODO(fbarchard): Add the following: -// H010ToARGB -// I010ToARGB - -LIBYUV_API -int ConvertToARGB(const uint8_t* sample, - size_t sample_size, - uint8_t* dst_argb, - int dst_stride_argb, - int crop_x, - int crop_y, - int src_width, - int src_height, - int crop_width, - int crop_height, - enum RotationMode rotation, - uint32_t fourcc) { - uint32_t format = CanonicalFourCC(fourcc); - int aligned_src_width = (src_width + 1) & ~1; - const uint8_t* src; - const uint8_t* src_uv; - int abs_src_height = (src_height < 0) ? -src_height : src_height; - int inv_crop_height = (crop_height < 0) ? -crop_height : crop_height; - int r = 0; - - // One pass rotation is available for some formats. For the rest, convert - // to ARGB (with optional vertical flipping) into a temporary ARGB buffer, - // and then rotate the ARGB to the final destination buffer. - // For in-place conversion, if destination dst_argb is same as source sample, - // also enable temporary buffer. - LIBYUV_BOOL need_buf = - (rotation && format != FOURCC_ARGB) || dst_argb == sample; - uint8_t* dest_argb = dst_argb; - int dest_dst_stride_argb = dst_stride_argb; - uint8_t* rotate_buffer = NULL; - int abs_crop_height = (crop_height < 0) ? -crop_height : crop_height; - - if (dst_argb == NULL || sample == NULL || src_width <= 0 || crop_width <= 0 || - src_height == 0 || crop_height == 0) { - return -1; - } - if (src_height < 0) { - inv_crop_height = -inv_crop_height; - } - - if (need_buf) { - int argb_size = crop_width * 4 * abs_crop_height; - rotate_buffer = (uint8_t*)malloc(argb_size); /* NOLINT */ - if (!rotate_buffer) { - return 1; // Out of memory runtime error. 
- } - dst_argb = rotate_buffer; - dst_stride_argb = crop_width * 4; - } - - switch (format) { - // Single plane formats - case FOURCC_YUY2: - src = sample + (aligned_src_width * crop_y + crop_x) * 2; - r = YUY2ToARGB(src, aligned_src_width * 2, dst_argb, dst_stride_argb, - crop_width, inv_crop_height); - break; - case FOURCC_UYVY: - src = sample + (aligned_src_width * crop_y + crop_x) * 2; - r = UYVYToARGB(src, aligned_src_width * 2, dst_argb, dst_stride_argb, - crop_width, inv_crop_height); - break; - case FOURCC_24BG: - src = sample + (src_width * crop_y + crop_x) * 3; - r = RGB24ToARGB(src, src_width * 3, dst_argb, dst_stride_argb, crop_width, - inv_crop_height); - break; - case FOURCC_RAW: - src = sample + (src_width * crop_y + crop_x) * 3; - r = RAWToARGB(src, src_width * 3, dst_argb, dst_stride_argb, crop_width, - inv_crop_height); - break; - case FOURCC_ARGB: - if (!need_buf && !rotation) { - src = sample + (src_width * crop_y + crop_x) * 4; - r = ARGBToARGB(src, src_width * 4, dst_argb, dst_stride_argb, - crop_width, inv_crop_height); - } - break; - case FOURCC_BGRA: - src = sample + (src_width * crop_y + crop_x) * 4; - r = BGRAToARGB(src, src_width * 4, dst_argb, dst_stride_argb, crop_width, - inv_crop_height); - break; - case FOURCC_ABGR: - src = sample + (src_width * crop_y + crop_x) * 4; - r = ABGRToARGB(src, src_width * 4, dst_argb, dst_stride_argb, crop_width, - inv_crop_height); - break; - case FOURCC_RGBA: - src = sample + (src_width * crop_y + crop_x) * 4; - r = RGBAToARGB(src, src_width * 4, dst_argb, dst_stride_argb, crop_width, - inv_crop_height); - break; - case FOURCC_AR30: - src = sample + (src_width * crop_y + crop_x) * 4; - r = AR30ToARGB(src, src_width * 4, dst_argb, dst_stride_argb, crop_width, - inv_crop_height); - break; - case FOURCC_AB30: - src = sample + (src_width * crop_y + crop_x) * 4; - r = AB30ToARGB(src, src_width * 4, dst_argb, dst_stride_argb, crop_width, - inv_crop_height); - break; - case FOURCC_RGBP: - src = sample + (src_width * crop_y + crop_x) * 2; - r = RGB565ToARGB(src, src_width * 2, dst_argb, dst_stride_argb, - crop_width, inv_crop_height); - break; - case FOURCC_RGBO: - src = sample + (src_width * crop_y + crop_x) * 2; - r = ARGB1555ToARGB(src, src_width * 2, dst_argb, dst_stride_argb, - crop_width, inv_crop_height); - break; - case FOURCC_R444: - src = sample + (src_width * crop_y + crop_x) * 2; - r = ARGB4444ToARGB(src, src_width * 2, dst_argb, dst_stride_argb, - crop_width, inv_crop_height); - break; - case FOURCC_I400: - src = sample + src_width * crop_y + crop_x; - r = I400ToARGB(src, src_width, dst_argb, dst_stride_argb, crop_width, - inv_crop_height); - break; - case FOURCC_J400: - src = sample + src_width * crop_y + crop_x; - r = J400ToARGB(src, src_width, dst_argb, dst_stride_argb, crop_width, - inv_crop_height); - break; - - // Biplanar formats - case FOURCC_NV12: - src = sample + (src_width * crop_y + crop_x); - src_uv = - sample + aligned_src_width * (abs_src_height + crop_y / 2) + crop_x; - r = NV12ToARGB(src, src_width, src_uv, aligned_src_width, dst_argb, - dst_stride_argb, crop_width, inv_crop_height); - break; - case FOURCC_NV21: - src = sample + (src_width * crop_y + crop_x); - src_uv = - sample + aligned_src_width * (abs_src_height + crop_y / 2) + crop_x; - // Call NV12 but with u and v parameters swapped. 
- r = NV21ToARGB(src, src_width, src_uv, aligned_src_width, dst_argb, - dst_stride_argb, crop_width, inv_crop_height); - break; - // Triplanar formats - case FOURCC_I420: - case FOURCC_YV12: { - const uint8_t* src_y = sample + (src_width * crop_y + crop_x); - const uint8_t* src_u; - const uint8_t* src_v; - int halfwidth = (src_width + 1) / 2; - int halfheight = (abs_src_height + 1) / 2; - if (format == FOURCC_YV12) { - src_v = sample + src_width * abs_src_height + - (halfwidth * crop_y + crop_x) / 2; - src_u = sample + src_width * abs_src_height + - halfwidth * (halfheight + crop_y / 2) + crop_x / 2; - } else { - src_u = sample + src_width * abs_src_height + - (halfwidth * crop_y + crop_x) / 2; - src_v = sample + src_width * abs_src_height + - halfwidth * (halfheight + crop_y / 2) + crop_x / 2; - } - r = I420ToARGB(src_y, src_width, src_u, halfwidth, src_v, halfwidth, - dst_argb, dst_stride_argb, crop_width, inv_crop_height); - break; - } - - case FOURCC_J420: { - int halfwidth = (src_width + 1) / 2; - int halfheight = (abs_src_height + 1) / 2; - const uint8_t* src_y = sample + (src_width * crop_y + crop_x); - const uint8_t* src_u = sample + src_width * abs_src_height + - (halfwidth * crop_y + crop_x) / 2; - const uint8_t* src_v = sample + src_width * abs_src_height + - halfwidth * (halfheight + crop_y / 2) + crop_x / 2; - r = J420ToARGB(src_y, src_width, src_u, halfwidth, src_v, halfwidth, - dst_argb, dst_stride_argb, crop_width, inv_crop_height); - break; - } - - case FOURCC_H420: { - int halfwidth = (src_width + 1) / 2; - int halfheight = (abs_src_height + 1) / 2; - const uint8_t* src_y = sample + (src_width * crop_y + crop_x); - const uint8_t* src_u = sample + src_width * abs_src_height + - (halfwidth * crop_y + crop_x) / 2; - const uint8_t* src_v = sample + src_width * abs_src_height + - halfwidth * (halfheight + crop_y / 2) + crop_x / 2; - r = H420ToARGB(src_y, src_width, src_u, halfwidth, src_v, halfwidth, - dst_argb, dst_stride_argb, crop_width, inv_crop_height); - break; - } - - case FOURCC_U420: { - int halfwidth = (src_width + 1) / 2; - int halfheight = (abs_src_height + 1) / 2; - const uint8_t* src_y = sample + (src_width * crop_y + crop_x); - const uint8_t* src_u = sample + src_width * abs_src_height + - (halfwidth * crop_y + crop_x) / 2; - const uint8_t* src_v = sample + src_width * abs_src_height + - halfwidth * (halfheight + crop_y / 2) + crop_x / 2; - r = U420ToARGB(src_y, src_width, src_u, halfwidth, src_v, halfwidth, - dst_argb, dst_stride_argb, crop_width, inv_crop_height); - break; - } - - case FOURCC_I422: - case FOURCC_YV16: { - int halfwidth = (src_width + 1) / 2; - const uint8_t* src_y = sample + src_width * crop_y + crop_x; - const uint8_t* src_u; - const uint8_t* src_v; - if (format == FOURCC_YV16) { - src_v = sample + src_width * abs_src_height + halfwidth * crop_y + - crop_x / 2; - src_u = sample + src_width * abs_src_height + - halfwidth * (abs_src_height + crop_y) + crop_x / 2; - } else { - src_u = sample + src_width * abs_src_height + halfwidth * crop_y + - crop_x / 2; - src_v = sample + src_width * abs_src_height + - halfwidth * (abs_src_height + crop_y) + crop_x / 2; - } - r = I422ToARGB(src_y, src_width, src_u, halfwidth, src_v, halfwidth, - dst_argb, dst_stride_argb, crop_width, inv_crop_height); - break; - } - - case FOURCC_J422: { - int halfwidth = (src_width + 1) / 2; - const uint8_t* src_y = sample + src_width * crop_y + crop_x; - const uint8_t* src_u = - sample + src_width * abs_src_height + halfwidth * crop_y + crop_x / 2; - const uint8_t* src_v = 
sample + src_width * abs_src_height + - halfwidth * (abs_src_height + crop_y) + crop_x / 2; - r = J422ToARGB(src_y, src_width, src_u, halfwidth, src_v, halfwidth, - dst_argb, dst_stride_argb, crop_width, inv_crop_height); - break; - } - - case FOURCC_H422: { - int halfwidth = (src_width + 1) / 2; - const uint8_t* src_y = sample + src_width * crop_y + crop_x; - const uint8_t* src_u = - sample + src_width * abs_src_height + halfwidth * crop_y + crop_x / 2; - const uint8_t* src_v = sample + src_width * abs_src_height + - halfwidth * (abs_src_height + crop_y) + crop_x / 2; - r = H422ToARGB(src_y, src_width, src_u, halfwidth, src_v, halfwidth, - dst_argb, dst_stride_argb, crop_width, inv_crop_height); - break; - } - - case FOURCC_U422: { - int halfwidth = (src_width + 1) / 2; - const uint8_t* src_y = sample + src_width * crop_y + crop_x; - const uint8_t* src_u = - sample + src_width * abs_src_height + halfwidth * crop_y + crop_x / 2; - const uint8_t* src_v = sample + src_width * abs_src_height + - halfwidth * (abs_src_height + crop_y) + crop_x / 2; - r = H422ToARGB(src_y, src_width, src_u, halfwidth, src_v, halfwidth, - dst_argb, dst_stride_argb, crop_width, inv_crop_height); - break; - } - - case FOURCC_I444: - case FOURCC_YV24: { - const uint8_t* src_y = sample + src_width * crop_y + crop_x; - const uint8_t* src_u; - const uint8_t* src_v; - if (format == FOURCC_YV24) { - src_v = sample + src_width * (abs_src_height + crop_y) + crop_x; - src_u = sample + src_width * (abs_src_height * 2 + crop_y) + crop_x; - } else { - src_u = sample + src_width * (abs_src_height + crop_y) + crop_x; - src_v = sample + src_width * (abs_src_height * 2 + crop_y) + crop_x; - } - r = I444ToARGB(src_y, src_width, src_u, src_width, src_v, src_width, - dst_argb, dst_stride_argb, crop_width, inv_crop_height); - break; - } - - case FOURCC_J444: { - const uint8_t* src_y = sample + src_width * crop_y + crop_x; - const uint8_t* src_u; - const uint8_t* src_v; - src_u = sample + src_width * (abs_src_height + crop_y) + crop_x; - src_v = sample + src_width * (abs_src_height * 2 + crop_y) + crop_x; - r = J444ToARGB(src_y, src_width, src_u, src_width, src_v, src_width, - dst_argb, dst_stride_argb, crop_width, inv_crop_height); - break; - } - - case FOURCC_H444: { - const uint8_t* src_y = sample + src_width * crop_y + crop_x; - const uint8_t* src_u; - const uint8_t* src_v; - src_u = sample + src_width * (abs_src_height + crop_y) + crop_x; - src_v = sample + src_width * (abs_src_height * 2 + crop_y) + crop_x; - r = H444ToARGB(src_y, src_width, src_u, src_width, src_v, src_width, - dst_argb, dst_stride_argb, crop_width, inv_crop_height); - break; - } - - case FOURCC_U444: { - const uint8_t* src_y = sample + src_width * crop_y + crop_x; - const uint8_t* src_u; - const uint8_t* src_v; - src_u = sample + src_width * (abs_src_height + crop_y) + crop_x; - src_v = sample + src_width * (abs_src_height * 2 + crop_y) + crop_x; - r = U444ToARGB(src_y, src_width, src_u, src_width, src_v, src_width, - dst_argb, dst_stride_argb, crop_width, inv_crop_height); - break; - } - -#ifdef HAVE_JPEG - case FOURCC_MJPG: - r = MJPGToARGB(sample, sample_size, dst_argb, dst_stride_argb, src_width, - abs_src_height, crop_width, inv_crop_height); - break; -#endif - default: - r = -1; // unknown fourcc - return failure code. 
- } - - if (need_buf) { - if (!r) { - r = ARGBRotate(dst_argb, dst_stride_argb, dest_argb, dest_dst_stride_argb, - crop_width, abs_crop_height, rotation); - } - free(rotate_buffer); - } else if (rotation) { - src = sample + (src_width * crop_y + crop_x) * 4; - r = ARGBRotate(src, src_width * 4, dst_argb, dst_stride_argb, crop_width, - inv_crop_height, rotation); - } - - return r; -} - -#ifdef __cplusplus -} // extern "C" -} // namespace libyuv -#endif diff --git a/thirdparty/libyuv/source/convert_to_i420.cc b/thirdparty/libyuv/source/convert_to_i420.cc deleted file mode 100644 index ac6eeab..0000000 --- a/thirdparty/libyuv/source/convert_to_i420.cc +++ /dev/null @@ -1,272 +0,0 @@ -/* - * Copyright 2011 The LibYuv Project Authors. All rights reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#include - -#include "libyuv/convert.h" - -#include "libyuv/video_common.h" - -#ifdef __cplusplus -namespace libyuv { -extern "C" { -#endif - -// Convert camera sample to I420 with cropping, rotation and vertical flip. -// src_width is used for source stride computation -// src_height is used to compute location of planes, and indicate inversion -// sample_size is measured in bytes and is the size of the frame. -// With MJPEG it is the compressed size of the frame. -LIBYUV_API -int ConvertToI420(const uint8_t* sample, - size_t sample_size, - uint8_t* dst_y, - int dst_stride_y, - uint8_t* dst_u, - int dst_stride_u, - uint8_t* dst_v, - int dst_stride_v, - int crop_x, - int crop_y, - int src_width, - int src_height, - int crop_width, - int crop_height, - enum RotationMode rotation, - uint32_t fourcc) { - uint32_t format = CanonicalFourCC(fourcc); - int aligned_src_width = (src_width + 1) & ~1; - const uint8_t* src; - const uint8_t* src_uv; - const int abs_src_height = (src_height < 0) ? -src_height : src_height; - // TODO(nisse): Why allow crop_height < 0? - const int abs_crop_height = (crop_height < 0) ? -crop_height : crop_height; - int r = 0; - LIBYUV_BOOL need_buf = - (rotation && format != FOURCC_I420 && format != FOURCC_NV12 && - format != FOURCC_NV21 && format != FOURCC_YV12) || - dst_y == sample; - uint8_t* tmp_y = dst_y; - uint8_t* tmp_u = dst_u; - uint8_t* tmp_v = dst_v; - int tmp_y_stride = dst_stride_y; - int tmp_u_stride = dst_stride_u; - int tmp_v_stride = dst_stride_v; - uint8_t* rotate_buffer = NULL; - const int inv_crop_height = - (src_height < 0) ? -abs_crop_height : abs_crop_height; - - if (!dst_y || !dst_u || !dst_v || !sample || src_width <= 0 || - crop_width <= 0 || src_height == 0 || crop_height == 0) { - return -1; - } - - // One pass rotation is available for some formats. For the rest, convert - // to I420 (with optional vertical flipping) into a temporary I420 buffer, - // and then rotate the I420 to the final destination buffer. - // For in-place conversion, if destination dst_y is same as source sample, - // also enable temporary buffer. - if (need_buf) { - int y_size = crop_width * abs_crop_height; - int uv_size = ((crop_width + 1) / 2) * ((abs_crop_height + 1) / 2); - rotate_buffer = (uint8_t*)malloc(y_size + uv_size * 2); /* NOLINT */ - if (!rotate_buffer) { - return 1; // Out of memory runtime error. 
- } - dst_y = rotate_buffer; - dst_u = dst_y + y_size; - dst_v = dst_u + uv_size; - dst_stride_y = crop_width; - dst_stride_u = dst_stride_v = ((crop_width + 1) / 2); - } - - switch (format) { - // Single plane formats - case FOURCC_YUY2: - src = sample + (aligned_src_width * crop_y + crop_x) * 2; - r = YUY2ToI420(src, aligned_src_width * 2, dst_y, dst_stride_y, dst_u, - dst_stride_u, dst_v, dst_stride_v, crop_width, - inv_crop_height); - break; - case FOURCC_UYVY: - src = sample + (aligned_src_width * crop_y + crop_x) * 2; - r = UYVYToI420(src, aligned_src_width * 2, dst_y, dst_stride_y, dst_u, - dst_stride_u, dst_v, dst_stride_v, crop_width, - inv_crop_height); - break; - case FOURCC_RGBP: - src = sample + (src_width * crop_y + crop_x) * 2; - r = RGB565ToI420(src, src_width * 2, dst_y, dst_stride_y, dst_u, - dst_stride_u, dst_v, dst_stride_v, crop_width, - inv_crop_height); - break; - case FOURCC_RGBO: - src = sample + (src_width * crop_y + crop_x) * 2; - r = ARGB1555ToI420(src, src_width * 2, dst_y, dst_stride_y, dst_u, - dst_stride_u, dst_v, dst_stride_v, crop_width, - inv_crop_height); - break; - case FOURCC_R444: - src = sample + (src_width * crop_y + crop_x) * 2; - r = ARGB4444ToI420(src, src_width * 2, dst_y, dst_stride_y, dst_u, - dst_stride_u, dst_v, dst_stride_v, crop_width, - inv_crop_height); - break; - case FOURCC_24BG: - src = sample + (src_width * crop_y + crop_x) * 3; - r = RGB24ToI420(src, src_width * 3, dst_y, dst_stride_y, dst_u, - dst_stride_u, dst_v, dst_stride_v, crop_width, - inv_crop_height); - break; - case FOURCC_RAW: - src = sample + (src_width * crop_y + crop_x) * 3; - r = RAWToI420(src, src_width * 3, dst_y, dst_stride_y, dst_u, - dst_stride_u, dst_v, dst_stride_v, crop_width, - inv_crop_height); - break; - case FOURCC_ARGB: - src = sample + (src_width * crop_y + crop_x) * 4; - r = ARGBToI420(src, src_width * 4, dst_y, dst_stride_y, dst_u, - dst_stride_u, dst_v, dst_stride_v, crop_width, - inv_crop_height); - break; - case FOURCC_BGRA: - src = sample + (src_width * crop_y + crop_x) * 4; - r = BGRAToI420(src, src_width * 4, dst_y, dst_stride_y, dst_u, - dst_stride_u, dst_v, dst_stride_v, crop_width, - inv_crop_height); - break; - case FOURCC_ABGR: - src = sample + (src_width * crop_y + crop_x) * 4; - r = ABGRToI420(src, src_width * 4, dst_y, dst_stride_y, dst_u, - dst_stride_u, dst_v, dst_stride_v, crop_width, - inv_crop_height); - break; - case FOURCC_RGBA: - src = sample + (src_width * crop_y + crop_x) * 4; - r = RGBAToI420(src, src_width * 4, dst_y, dst_stride_y, dst_u, - dst_stride_u, dst_v, dst_stride_v, crop_width, - inv_crop_height); - break; - // TODO(fbarchard): Add AR30 and AB30 - case FOURCC_I400: - src = sample + src_width * crop_y + crop_x; - r = I400ToI420(src, src_width, dst_y, dst_stride_y, dst_u, dst_stride_u, - dst_v, dst_stride_v, crop_width, inv_crop_height); - break; - // Biplanar formats - case FOURCC_NV12: - src = sample + (src_width * crop_y + crop_x); - src_uv = sample + (src_width * abs_src_height) + - ((crop_y / 2) * aligned_src_width) + ((crop_x / 2) * 2); - r = NV12ToI420Rotate(src, src_width, src_uv, aligned_src_width, dst_y, - dst_stride_y, dst_u, dst_stride_u, dst_v, - dst_stride_v, crop_width, inv_crop_height, rotation); - break; - case FOURCC_NV21: - src = sample + (src_width * crop_y + crop_x); - src_uv = sample + (src_width * abs_src_height) + - ((crop_y / 2) * aligned_src_width) + ((crop_x / 2) * 2); - // Call NV12 but with dst_u and dst_v parameters swapped. 
- r = NV12ToI420Rotate(src, src_width, src_uv, aligned_src_width, dst_y, - dst_stride_y, dst_v, dst_stride_v, dst_u, - dst_stride_u, crop_width, inv_crop_height, rotation); - break; - // Triplanar formats - case FOURCC_I420: - case FOURCC_YV12: { - const uint8_t* src_y = sample + (src_width * crop_y + crop_x); - const uint8_t* src_u; - const uint8_t* src_v; - int halfwidth = (src_width + 1) / 2; - int halfheight = (abs_src_height + 1) / 2; - if (format == FOURCC_YV12) { - src_v = sample + src_width * abs_src_height + halfwidth * (crop_y / 2) + - (crop_x / 2); - src_u = sample + src_width * abs_src_height + - halfwidth * (halfheight + (crop_y / 2)) + (crop_x / 2); - } else { - src_u = sample + src_width * abs_src_height + halfwidth * (crop_y / 2) + - (crop_x / 2); - src_v = sample + src_width * abs_src_height + - halfwidth * (halfheight + (crop_y / 2)) + (crop_x / 2); - } - r = I420Rotate(src_y, src_width, src_u, halfwidth, src_v, halfwidth, - dst_y, dst_stride_y, dst_u, dst_stride_u, dst_v, - dst_stride_v, crop_width, inv_crop_height, rotation); - break; - } - case FOURCC_I422: - case FOURCC_YV16: { - const uint8_t* src_y = sample + src_width * crop_y + crop_x; - const uint8_t* src_u; - const uint8_t* src_v; - int halfwidth = (src_width + 1) / 2; - if (format == FOURCC_YV16) { - src_v = sample + src_width * abs_src_height + halfwidth * crop_y + - (crop_x / 2); - src_u = sample + src_width * abs_src_height + - halfwidth * (abs_src_height + crop_y) + (crop_x / 2); - } else { - src_u = sample + src_width * abs_src_height + halfwidth * crop_y + - (crop_x / 2); - src_v = sample + src_width * abs_src_height + - halfwidth * (abs_src_height + crop_y) + (crop_x / 2); - } - r = I422ToI420(src_y, src_width, src_u, halfwidth, src_v, halfwidth, - dst_y, dst_stride_y, dst_u, dst_stride_u, dst_v, - dst_stride_v, crop_width, inv_crop_height); - break; - } - case FOURCC_I444: - case FOURCC_YV24: { - const uint8_t* src_y = sample + src_width * crop_y + crop_x; - const uint8_t* src_u; - const uint8_t* src_v; - if (format == FOURCC_YV24) { - src_v = sample + src_width * (abs_src_height + crop_y) + crop_x; - src_u = sample + src_width * (abs_src_height * 2 + crop_y) + crop_x; - } else { - src_u = sample + src_width * (abs_src_height + crop_y) + crop_x; - src_v = sample + src_width * (abs_src_height * 2 + crop_y) + crop_x; - } - r = I444ToI420(src_y, src_width, src_u, src_width, src_v, src_width, - dst_y, dst_stride_y, dst_u, dst_stride_u, dst_v, - dst_stride_v, crop_width, inv_crop_height); - break; - } -#ifdef HAVE_JPEG - case FOURCC_MJPG: - r = MJPGToI420(sample, sample_size, dst_y, dst_stride_y, dst_u, - dst_stride_u, dst_v, dst_stride_v, src_width, - abs_src_height, crop_width, inv_crop_height); - break; -#endif - default: - r = -1; // unknown fourcc - return failure code. - } - - if (need_buf) { - if (!r) { - r = I420Rotate(dst_y, dst_stride_y, dst_u, dst_stride_u, dst_v, - dst_stride_v, tmp_y, tmp_y_stride, tmp_u, tmp_u_stride, - tmp_v, tmp_v_stride, crop_width, abs_crop_height, - rotation); - } - free(rotate_buffer); - } - - return r; -} - -#ifdef __cplusplus -} // extern "C" -} // namespace libyuv -#endif diff --git a/thirdparty/libyuv/source/cpu_id.cc b/thirdparty/libyuv/source/cpu_id.cc deleted file mode 100644 index fe89452..0000000 --- a/thirdparty/libyuv/source/cpu_id.cc +++ /dev/null @@ -1,280 +0,0 @@ -/* - * Copyright 2011 The LibYuv Project Authors. All rights reserved. 
- * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#include "libyuv/cpu_id.h" - -#if defined(_MSC_VER) -#include // For __cpuidex() -#endif -#if !defined(__pnacl__) && !defined(__CLR_VER) && \ - !defined(__native_client__) && (defined(_M_IX86) || defined(_M_X64)) && \ - defined(_MSC_FULL_VER) && (_MSC_FULL_VER >= 160040219) -#include // For _xgetbv() -#endif - -// For ArmCpuCaps() but unittested on all platforms -#include -#include - -#ifdef __cplusplus -namespace libyuv { -extern "C" { -#endif - -// For functions that use the stack and have runtime checks for overflow, -// use SAFEBUFFERS to avoid additional check. -#if defined(_MSC_FULL_VER) && (_MSC_FULL_VER >= 160040219) && \ - !defined(__clang__) -#define SAFEBUFFERS __declspec(safebuffers) -#else -#define SAFEBUFFERS -#endif - -// cpu_info_ variable for SIMD instruction sets detected. -LIBYUV_API int cpu_info_ = 0; - -// TODO(fbarchard): Consider using int for cpuid so casting is not needed. -// Low level cpuid for X86. -#if (defined(_M_IX86) || defined(_M_X64) || defined(__i386__) || \ - defined(__x86_64__)) && \ - !defined(__pnacl__) && !defined(__CLR_VER) -LIBYUV_API -void CpuId(int info_eax, int info_ecx, int* cpu_info) { -#if defined(_MSC_VER) -// Visual C version uses intrinsic or inline x86 assembly. -#if defined(_MSC_FULL_VER) && (_MSC_FULL_VER >= 160040219) - __cpuidex(cpu_info, info_eax, info_ecx); -#elif defined(_M_IX86) - __asm { - mov eax, info_eax - mov ecx, info_ecx - mov edi, cpu_info - cpuid - mov [edi], eax - mov [edi + 4], ebx - mov [edi + 8], ecx - mov [edi + 12], edx - } -#else // Visual C but not x86 - if (info_ecx == 0) { - __cpuid(cpu_info, info_eax); - } else { - cpu_info[3] = cpu_info[2] = cpu_info[1] = cpu_info[0] = 0u; - } -#endif -// GCC version uses inline x86 assembly. -#else // defined(_MSC_VER) - int info_ebx, info_edx; - asm volatile( -#if defined(__i386__) && defined(__PIC__) - // Preserve ebx for fpic 32 bit. - "mov %%ebx, %%edi \n" - "cpuid \n" - "xchg %%edi, %%ebx \n" - : "=D"(info_ebx), -#else - "cpuid \n" - : "=b"(info_ebx), -#endif // defined( __i386__) && defined(__PIC__) - "+a"(info_eax), "+c"(info_ecx), "=d"(info_edx)); - cpu_info[0] = info_eax; - cpu_info[1] = info_ebx; - cpu_info[2] = info_ecx; - cpu_info[3] = info_edx; -#endif // defined(_MSC_VER) -} -#else // (defined(_M_IX86) || defined(_M_X64) ... -LIBYUV_API -void CpuId(int eax, int ecx, int* cpu_info) { - (void)eax; - (void)ecx; - cpu_info[0] = cpu_info[1] = cpu_info[2] = cpu_info[3] = 0; -} -#endif - -// For VS2010 and earlier emit can be used: -// _asm _emit 0x0f _asm _emit 0x01 _asm _emit 0xd0 // For VS2010 and earlier. -// __asm { -// xor ecx, ecx // xcr 0 -// xgetbv -// mov xcr0, eax -// } -// For VS2013 and earlier 32 bit, the _xgetbv(0) optimizer produces bad code. -// https://code.google.com/p/libyuv/issues/detail?id=529 -#if defined(_M_IX86) && (_MSC_VER < 1900) -#pragma optimize("g", off) -#endif -#if (defined(_M_IX86) || defined(_M_X64) || defined(__i386__) || \ - defined(__x86_64__)) && \ - !defined(__pnacl__) && !defined(__CLR_VER) && !defined(__native_client__) -// X86 CPUs have xgetbv to detect OS saves high parts of ymm registers. 
-int GetXCR0() { - int xcr0 = 0; -#if defined(_MSC_FULL_VER) && (_MSC_FULL_VER >= 160040219) - xcr0 = (int)_xgetbv(0); // VS2010 SP1 required. NOLINT -#elif defined(__i386__) || defined(__x86_64__) - asm(".byte 0x0f, 0x01, 0xd0" : "=a"(xcr0) : "c"(0) : "%edx"); -#endif // defined(__i386__) || defined(__x86_64__) - return xcr0; -} -#else -// xgetbv unavailable to query for OSSave support. Return 0. -#define GetXCR0() 0 -#endif // defined(_M_IX86) || defined(_M_X64) .. -// Return optimization to previous setting. -#if defined(_M_IX86) && (_MSC_VER < 1900) -#pragma optimize("g", on) -#endif - -// based on libvpx arm_cpudetect.c -// For Arm, but public to allow testing on any CPU -LIBYUV_API SAFEBUFFERS int ArmCpuCaps(const char* cpuinfo_name) { - char cpuinfo_line[512]; - FILE* f = fopen(cpuinfo_name, "r"); - if (!f) { - // Assume Neon if /proc/cpuinfo is unavailable. - // This will occur for Chrome sandbox for Pepper or Render process. - return kCpuHasNEON; - } - while (fgets(cpuinfo_line, sizeof(cpuinfo_line) - 1, f)) { - if (memcmp(cpuinfo_line, "Features", 8) == 0) { - char* p = strstr(cpuinfo_line, " neon"); - if (p && (p[5] == ' ' || p[5] == '\n')) { - fclose(f); - return kCpuHasNEON; - } - // aarch64 uses asimd for Neon. - p = strstr(cpuinfo_line, " asimd"); - if (p) { - fclose(f); - return kCpuHasNEON; - } - } - } - fclose(f); - return 0; -} - -// TODO(fbarchard): Consider read_msa_ir(). -LIBYUV_API SAFEBUFFERS int MipsCpuCaps(const char* cpuinfo_name) { - char cpuinfo_line[512]; - int flag = 0x0; - FILE* f = fopen(cpuinfo_name, "r"); - if (!f) { - // Assume nothing if /proc/cpuinfo is unavailable. - // This will occur for Chrome sandbox for Pepper or Render process. - return 0; - } - while (fgets(cpuinfo_line, sizeof(cpuinfo_line) - 1, f)) { - if (memcmp(cpuinfo_line, "cpu model", 9) == 0) { - // Workaround early kernel without mmi in ASEs line. - if (strstr(cpuinfo_line, "Loongson-3")) { - flag |= kCpuHasMMI; - } else if (strstr(cpuinfo_line, "Loongson-2K")) { - flag |= kCpuHasMMI | kCpuHasMSA; - } - } - if (memcmp(cpuinfo_line, "ASEs implemented", 16) == 0) { - if (strstr(cpuinfo_line, "loongson-mmi") && - strstr(cpuinfo_line, "loongson-ext")) { - flag |= kCpuHasMMI; - } - if (strstr(cpuinfo_line, "msa")) { - flag |= kCpuHasMSA; - } - // ASEs is the last line, so we can break here. - break; - } - } - fclose(f); - return flag; -} - -static SAFEBUFFERS int GetCpuFlags(void) { - int cpu_info = 0; -#if !defined(__pnacl__) && !defined(__CLR_VER) && \ - (defined(__x86_64__) || defined(_M_X64) || defined(__i386__) || \ - defined(_M_IX86)) - int cpu_info0[4] = {0, 0, 0, 0}; - int cpu_info1[4] = {0, 0, 0, 0}; - int cpu_info7[4] = {0, 0, 0, 0}; - CpuId(0, 0, cpu_info0); - CpuId(1, 0, cpu_info1); - if (cpu_info0[0] >= 7) { - CpuId(7, 0, cpu_info7); - } - cpu_info = kCpuHasX86 | ((cpu_info1[3] & 0x04000000) ? kCpuHasSSE2 : 0) | - ((cpu_info1[2] & 0x00000200) ? kCpuHasSSSE3 : 0) | - ((cpu_info1[2] & 0x00080000) ? kCpuHasSSE41 : 0) | - ((cpu_info1[2] & 0x00100000) ? kCpuHasSSE42 : 0) | - ((cpu_info7[1] & 0x00000200) ? kCpuHasERMS : 0); - - // AVX requires OS saves YMM registers. - if (((cpu_info1[2] & 0x1c000000) == 0x1c000000) && // AVX and OSXSave - ((GetXCR0() & 6) == 6)) { // Test OS saves YMM registers - cpu_info |= kCpuHasAVX | ((cpu_info7[1] & 0x00000020) ? kCpuHasAVX2 : 0) | - ((cpu_info1[2] & 0x00001000) ? kCpuHasFMA3 : 0) | - ((cpu_info1[2] & 0x20000000) ? kCpuHasF16C : 0); - - // Detect AVX512bw - if ((GetXCR0() & 0xe0) == 0xe0) { - cpu_info |= (cpu_info7[1] & 0x40000000) ? 
kCpuHasAVX512BW : 0; - cpu_info |= (cpu_info7[1] & 0x80000000) ? kCpuHasAVX512VL : 0; - cpu_info |= (cpu_info7[2] & 0x00000002) ? kCpuHasAVX512VBMI : 0; - cpu_info |= (cpu_info7[2] & 0x00000040) ? kCpuHasAVX512VBMI2 : 0; - cpu_info |= (cpu_info7[2] & 0x00001000) ? kCpuHasAVX512VBITALG : 0; - cpu_info |= (cpu_info7[2] & 0x00004000) ? kCpuHasAVX512VPOPCNTDQ : 0; - cpu_info |= (cpu_info7[2] & 0x00000100) ? kCpuHasGFNI : 0; - } - } -#endif -#if defined(__mips__) && defined(__linux__) - cpu_info = MipsCpuCaps("/proc/cpuinfo"); - cpu_info |= kCpuHasMIPS; -#endif -#if defined(__arm__) || defined(__aarch64__) -// gcc -mfpu=neon defines __ARM_NEON__ -// __ARM_NEON__ generates code that requires Neon. NaCL also requires Neon. -// For Linux, /proc/cpuinfo can be tested but without that assume Neon. -#if defined(__ARM_NEON__) || defined(__native_client__) || !defined(__linux__) - cpu_info = kCpuHasNEON; -// For aarch64(arm64), /proc/cpuinfo's feature is not complete, e.g. no neon -// flag in it. -// So for aarch64, neon enabling is hard coded here. -#endif -#if defined(__aarch64__) - cpu_info = kCpuHasNEON; -#else - // Linux arm parse text file for neon detect. - cpu_info = ArmCpuCaps("/proc/cpuinfo"); -#endif - cpu_info |= kCpuHasARM; -#endif // __arm__ - cpu_info |= kCpuInitialized; - return cpu_info; -} - -// Note that use of this function is not thread safe. -LIBYUV_API -int MaskCpuFlags(int enable_flags) { - int cpu_info = GetCpuFlags() & enable_flags; - SetCpuFlags(cpu_info); - return cpu_info; -} - -LIBYUV_API -int InitCpuFlags(void) { - return MaskCpuFlags(-1); -} - -#ifdef __cplusplus -} // extern "C" -} // namespace libyuv -#endif diff --git a/thirdparty/libyuv/source/mjpeg_decoder.cc b/thirdparty/libyuv/source/mjpeg_decoder.cc deleted file mode 100644 index adba832..0000000 --- a/thirdparty/libyuv/source/mjpeg_decoder.cc +++ /dev/null @@ -1,585 +0,0 @@ -/* - * Copyright 2012 The LibYuv Project Authors. All rights reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#include "libyuv/mjpeg_decoder.h" - -#ifdef HAVE_JPEG -#include <assert.h> - -#if !defined(__pnacl__) && !defined(__CLR_VER) && \ - !defined(COVERAGE_ENABLED) && !defined(TARGET_IPHONE_SIMULATOR) -// Must be included before jpeglib. -#include <setjmp.h> -#define HAVE_SETJMP - -#if defined(_MSC_VER) -// disable warning 4324: structure was padded due to __declspec(align()) -#pragma warning(disable : 4324) -#endif - -#endif - -#include <stdio.h> // For jpeglib.h. - -// C++ build requires extern C for jpeg internals. -#ifdef __cplusplus -extern "C" { -#endif - -#include <jpeglib.h> - -#ifdef __cplusplus -} // extern "C" -#endif - -#include "libyuv/planar_functions.h" // For CopyPlane(). - -namespace libyuv { - -#ifdef HAVE_SETJMP -struct SetJmpErrorMgr { - jpeg_error_mgr base; // Must be at the top - jmp_buf setjmp_buffer; -}; -#endif - -const int MJpegDecoder::kColorSpaceUnknown = JCS_UNKNOWN; -const int MJpegDecoder::kColorSpaceGrayscale = JCS_GRAYSCALE; -const int MJpegDecoder::kColorSpaceRgb = JCS_RGB; -const int MJpegDecoder::kColorSpaceYCbCr = JCS_YCbCr; -const int MJpegDecoder::kColorSpaceCMYK = JCS_CMYK; -const int MJpegDecoder::kColorSpaceYCCK = JCS_YCCK; - -// Methods that are passed to jpeglib.
-boolean fill_input_buffer(jpeg_decompress_struct* cinfo); -void init_source(jpeg_decompress_struct* cinfo); -void skip_input_data(jpeg_decompress_struct* cinfo, long num_bytes); // NOLINT -void term_source(jpeg_decompress_struct* cinfo); -void ErrorHandler(jpeg_common_struct* cinfo); -void OutputHandler(jpeg_common_struct* cinfo); - -MJpegDecoder::MJpegDecoder() - : has_scanline_padding_(LIBYUV_FALSE), - num_outbufs_(0), - scanlines_(NULL), - scanlines_sizes_(NULL), - databuf_(NULL), - databuf_strides_(NULL) { - decompress_struct_ = new jpeg_decompress_struct; - source_mgr_ = new jpeg_source_mgr; -#ifdef HAVE_SETJMP - error_mgr_ = new SetJmpErrorMgr; - decompress_struct_->err = jpeg_std_error(&error_mgr_->base); - // Override standard exit()-based error handler. - error_mgr_->base.error_exit = &ErrorHandler; - error_mgr_->base.output_message = &OutputHandler; -#endif - decompress_struct_->client_data = NULL; - source_mgr_->init_source = &init_source; - source_mgr_->fill_input_buffer = &fill_input_buffer; - source_mgr_->skip_input_data = &skip_input_data; - source_mgr_->resync_to_restart = &jpeg_resync_to_restart; - source_mgr_->term_source = &term_source; - jpeg_create_decompress(decompress_struct_); - decompress_struct_->src = source_mgr_; - buf_vec_.buffers = &buf_; - buf_vec_.len = 1; -} - -MJpegDecoder::~MJpegDecoder() { - jpeg_destroy_decompress(decompress_struct_); - delete decompress_struct_; - delete source_mgr_; -#ifdef HAVE_SETJMP - delete error_mgr_; -#endif - DestroyOutputBuffers(); -} - -LIBYUV_BOOL MJpegDecoder::LoadFrame(const uint8_t* src, size_t src_len) { - if (!ValidateJpeg(src, src_len)) { - return LIBYUV_FALSE; - } - - buf_.data = src; - buf_.len = static_cast<int>(src_len); - buf_vec_.pos = 0; - decompress_struct_->client_data = &buf_vec_; -#ifdef HAVE_SETJMP - if (setjmp(error_mgr_->setjmp_buffer)) { - // We called jpeg_read_header, it experienced an error, and we called - // longjmp() and rewound the stack to here. Return error. - return LIBYUV_FALSE; - } -#endif - if (jpeg_read_header(decompress_struct_, TRUE) != JPEG_HEADER_OK) { - // ERROR: Bad MJPEG header - return LIBYUV_FALSE; - } - AllocOutputBuffers(GetNumComponents()); - for (int i = 0; i < num_outbufs_; ++i) { - int scanlines_size = GetComponentScanlinesPerImcuRow(i); - if (scanlines_sizes_[i] != scanlines_size) { - if (scanlines_[i]) { - delete scanlines_[i]; - } - scanlines_[i] = new uint8_t*[scanlines_size]; - scanlines_sizes_[i] = scanlines_size; - } - - // We allocate padding for the final scanline to pad it up to DCTSIZE bytes - // to avoid memory errors, since jpeglib only reads full MCUs blocks. For - // the preceding scanlines, the padding is not needed/wanted because the - // following addresses will already be valid (they are the initial bytes of - // the next scanline) and will be overwritten when jpeglib writes out that - // next scanline.
- int databuf_stride = GetComponentStride(i); - int databuf_size = scanlines_size * databuf_stride; - if (databuf_strides_[i] != databuf_stride) { - if (databuf_[i]) { - delete databuf_[i]; - } - databuf_[i] = new uint8_t[databuf_size]; - databuf_strides_[i] = databuf_stride; - } - - if (GetComponentStride(i) != GetComponentWidth(i)) { - has_scanline_padding_ = LIBYUV_TRUE; - } - } - return LIBYUV_TRUE; -} - -static int DivideAndRoundUp(int numerator, int denominator) { - return (numerator + denominator - 1) / denominator; -} - -static int DivideAndRoundDown(int numerator, int denominator) { - return numerator / denominator; -} - -// Returns width of the last loaded frame. -int MJpegDecoder::GetWidth() { - return decompress_struct_->image_width; -} - -// Returns height of the last loaded frame. -int MJpegDecoder::GetHeight() { - return decompress_struct_->image_height; -} - -// Returns format of the last loaded frame. The return value is one of the -// kColorSpace* constants. -int MJpegDecoder::GetColorSpace() { - return decompress_struct_->jpeg_color_space; -} - -// Number of color components in the color space. -int MJpegDecoder::GetNumComponents() { - return decompress_struct_->num_components; -} - -// Sample factors of the n-th component. -int MJpegDecoder::GetHorizSampFactor(int component) { - return decompress_struct_->comp_info[component].h_samp_factor; -} - -int MJpegDecoder::GetVertSampFactor(int component) { - return decompress_struct_->comp_info[component].v_samp_factor; -} - -int MJpegDecoder::GetHorizSubSampFactor(int component) { - return decompress_struct_->max_h_samp_factor / GetHorizSampFactor(component); -} - -int MJpegDecoder::GetVertSubSampFactor(int component) { - return decompress_struct_->max_v_samp_factor / GetVertSampFactor(component); -} - -int MJpegDecoder::GetImageScanlinesPerImcuRow() { - return decompress_struct_->max_v_samp_factor * DCTSIZE; -} - -int MJpegDecoder::GetComponentScanlinesPerImcuRow(int component) { - int vs = GetVertSubSampFactor(component); - return DivideAndRoundUp(GetImageScanlinesPerImcuRow(), vs); -} - -int MJpegDecoder::GetComponentWidth(int component) { - int hs = GetHorizSubSampFactor(component); - return DivideAndRoundUp(GetWidth(), hs); -} - -int MJpegDecoder::GetComponentHeight(int component) { - int vs = GetVertSubSampFactor(component); - return DivideAndRoundUp(GetHeight(), vs); -} - -// Get width in bytes padded out to a multiple of DCTSIZE -int MJpegDecoder::GetComponentStride(int component) { - return (GetComponentWidth(component) + DCTSIZE - 1) & ~(DCTSIZE - 1); -} - -int MJpegDecoder::GetComponentSize(int component) { - return GetComponentWidth(component) * GetComponentHeight(component); -} - -LIBYUV_BOOL MJpegDecoder::UnloadFrame() { -#ifdef HAVE_SETJMP - if (setjmp(error_mgr_->setjmp_buffer)) { - // We called jpeg_abort_decompress, it experienced an error, and we called - // longjmp() and rewound the stack to here. Return error. - return LIBYUV_FALSE; - } -#endif - jpeg_abort_decompress(decompress_struct_); - return LIBYUV_TRUE; -} - -// TODO(fbarchard): Allow rectangle to be specified: x, y, width, height. -LIBYUV_BOOL MJpegDecoder::DecodeToBuffers(uint8_t** planes, - int dst_width, - int dst_height) { - if (dst_width != GetWidth() || dst_height > GetHeight()) { - // ERROR: Bad dimensions - return LIBYUV_FALSE; - } -#ifdef HAVE_SETJMP - if (setjmp(error_mgr_->setjmp_buffer)) { - // We called into jpeglib, it experienced an error sometime during this - // function call, and we called longjmp() and rewound the stack to here. 
- // Return error. - return LIBYUV_FALSE; - } -#endif - if (!StartDecode()) { - return LIBYUV_FALSE; - } - SetScanlinePointers(databuf_); - int lines_left = dst_height; - // Compute amount of lines to skip to implement vertical crop. - // TODO(fbarchard): Ensure skip is a multiple of maximum component - // subsample. ie 2 - int skip = (GetHeight() - dst_height) / 2; - if (skip > 0) { - // There is no API to skip lines in the output data, so we read them - // into the temp buffer. - while (skip >= GetImageScanlinesPerImcuRow()) { - if (!DecodeImcuRow()) { - FinishDecode(); - return LIBYUV_FALSE; - } - skip -= GetImageScanlinesPerImcuRow(); - } - if (skip > 0) { - // Have a partial iMCU row left over to skip. Must read it and then - // copy the parts we want into the destination. - if (!DecodeImcuRow()) { - FinishDecode(); - return LIBYUV_FALSE; - } - for (int i = 0; i < num_outbufs_; ++i) { - // TODO(fbarchard): Compute skip to avoid this - assert(skip % GetVertSubSampFactor(i) == 0); - int rows_to_skip = DivideAndRoundDown(skip, GetVertSubSampFactor(i)); - int scanlines_to_copy = - GetComponentScanlinesPerImcuRow(i) - rows_to_skip; - int data_to_skip = rows_to_skip * GetComponentStride(i); - CopyPlane(databuf_[i] + data_to_skip, GetComponentStride(i), planes[i], - GetComponentWidth(i), GetComponentWidth(i), - scanlines_to_copy); - planes[i] += scanlines_to_copy * GetComponentWidth(i); - } - lines_left -= (GetImageScanlinesPerImcuRow() - skip); - } - } - - // Read full MCUs but cropped horizontally - for (; lines_left > GetImageScanlinesPerImcuRow(); - lines_left -= GetImageScanlinesPerImcuRow()) { - if (!DecodeImcuRow()) { - FinishDecode(); - return LIBYUV_FALSE; - } - for (int i = 0; i < num_outbufs_; ++i) { - int scanlines_to_copy = GetComponentScanlinesPerImcuRow(i); - CopyPlane(databuf_[i], GetComponentStride(i), planes[i], - GetComponentWidth(i), GetComponentWidth(i), scanlines_to_copy); - planes[i] += scanlines_to_copy * GetComponentWidth(i); - } - } - - if (lines_left > 0) { - // Have a partial iMCU row left over to decode. - if (!DecodeImcuRow()) { - FinishDecode(); - return LIBYUV_FALSE; - } - for (int i = 0; i < num_outbufs_; ++i) { - int scanlines_to_copy = - DivideAndRoundUp(lines_left, GetVertSubSampFactor(i)); - CopyPlane(databuf_[i], GetComponentStride(i), planes[i], - GetComponentWidth(i), GetComponentWidth(i), scanlines_to_copy); - planes[i] += scanlines_to_copy * GetComponentWidth(i); - } - } - return FinishDecode(); -} - -LIBYUV_BOOL MJpegDecoder::DecodeToCallback(CallbackFunction fn, - void* opaque, - int dst_width, - int dst_height) { - if (dst_width != GetWidth() || dst_height > GetHeight()) { - // ERROR: Bad dimensions - return LIBYUV_FALSE; - } -#ifdef HAVE_SETJMP - if (setjmp(error_mgr_->setjmp_buffer)) { - // We called into jpeglib, it experienced an error sometime during this - // function call, and we called longjmp() and rewound the stack to here. - // Return error. - return LIBYUV_FALSE; - } -#endif - if (!StartDecode()) { - return LIBYUV_FALSE; - } - SetScanlinePointers(databuf_); - int lines_left = dst_height; - // TODO(fbarchard): Compute amount of lines to skip to implement vertical crop - int skip = (GetHeight() - dst_height) / 2; - if (skip > 0) { - while (skip >= GetImageScanlinesPerImcuRow()) { - if (!DecodeImcuRow()) { - FinishDecode(); - return LIBYUV_FALSE; - } - skip -= GetImageScanlinesPerImcuRow(); - } - if (skip > 0) { - // Have a partial iMCU row left over to skip. 
- if (!DecodeImcuRow()) { - FinishDecode(); - return LIBYUV_FALSE; - } - for (int i = 0; i < num_outbufs_; ++i) { - // TODO(fbarchard): Compute skip to avoid this - assert(skip % GetVertSubSampFactor(i) == 0); - int rows_to_skip = DivideAndRoundDown(skip, GetVertSubSampFactor(i)); - int data_to_skip = rows_to_skip * GetComponentStride(i); - // Change our own data buffer pointers so we can pass them to the - // callback. - databuf_[i] += data_to_skip; - } - int scanlines_to_copy = GetImageScanlinesPerImcuRow() - skip; - (*fn)(opaque, databuf_, databuf_strides_, scanlines_to_copy); - // Now change them back. - for (int i = 0; i < num_outbufs_; ++i) { - int rows_to_skip = DivideAndRoundDown(skip, GetVertSubSampFactor(i)); - int data_to_skip = rows_to_skip * GetComponentStride(i); - databuf_[i] -= data_to_skip; - } - lines_left -= scanlines_to_copy; - } - } - // Read full MCUs until we get to the crop point. - for (; lines_left >= GetImageScanlinesPerImcuRow(); - lines_left -= GetImageScanlinesPerImcuRow()) { - if (!DecodeImcuRow()) { - FinishDecode(); - return LIBYUV_FALSE; - } - (*fn)(opaque, databuf_, databuf_strides_, GetImageScanlinesPerImcuRow()); - } - if (lines_left > 0) { - // Have a partial iMCU row left over to decode. - if (!DecodeImcuRow()) { - FinishDecode(); - return LIBYUV_FALSE; - } - (*fn)(opaque, databuf_, databuf_strides_, lines_left); - } - return FinishDecode(); -} - -void init_source(j_decompress_ptr cinfo) { - fill_input_buffer(cinfo); -} - -boolean fill_input_buffer(j_decompress_ptr cinfo) { - BufferVector* buf_vec = reinterpret_cast<BufferVector*>(cinfo->client_data); - if (buf_vec->pos >= buf_vec->len) { - // Don't assert-fail when fuzzing. -#ifndef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION - assert(0 && "No more data"); -#endif - // ERROR: No more data - return FALSE; - } - cinfo->src->next_input_byte = buf_vec->buffers[buf_vec->pos].data; - cinfo->src->bytes_in_buffer = buf_vec->buffers[buf_vec->pos].len; - ++buf_vec->pos; - return TRUE; -} - -void skip_input_data(j_decompress_ptr cinfo, long num_bytes) { // NOLINT - jpeg_source_mgr* src = cinfo->src; - size_t bytes = static_cast<size_t>(num_bytes); - if (bytes > src->bytes_in_buffer) { - src->next_input_byte = nullptr; - src->bytes_in_buffer = 0; - } else { - src->next_input_byte += bytes; - src->bytes_in_buffer -= bytes; - } -} - -void term_source(j_decompress_ptr cinfo) { - (void)cinfo; // Nothing to do. -} - -#ifdef HAVE_SETJMP -void ErrorHandler(j_common_ptr cinfo) { -// This is called when a jpeglib command experiences an error. Unfortunately -// jpeglib's error handling model is not very flexible, because it expects the -// error handler to not return--i.e., it wants the program to terminate. To -// recover from errors we use setjmp() as shown in their example. setjmp() is -// C's implementation for the "call with current continuation" functionality -// seen in some functional programming languages. -// A formatted message can be output, but is unsafe for release. -#ifdef DEBUG - char buf[JMSG_LENGTH_MAX]; - (*cinfo->err->format_message)(cinfo, buf); -// ERROR: Error in jpeglib: buf -#endif - - SetJmpErrorMgr* mgr = reinterpret_cast<SetJmpErrorMgr*>(cinfo->err); - // This rewinds the call stack to the point of the corresponding setjmp() - // and causes it to return (for a second time) with value 1. - longjmp(mgr->setjmp_buffer, 1); -} - -// Suppress fprintf warnings.
-void OutputHandler(j_common_ptr cinfo) { - (void)cinfo; -} - -#endif // HAVE_SETJMP - -void MJpegDecoder::AllocOutputBuffers(int num_outbufs) { - if (num_outbufs != num_outbufs_) { - // We could perhaps optimize this case to resize the output buffers without - // necessarily having to delete and recreate each one, but it's not worth - // it. - DestroyOutputBuffers(); - - scanlines_ = new uint8_t**[num_outbufs]; - scanlines_sizes_ = new int[num_outbufs]; - databuf_ = new uint8_t*[num_outbufs]; - databuf_strides_ = new int[num_outbufs]; - - for (int i = 0; i < num_outbufs; ++i) { - scanlines_[i] = NULL; - scanlines_sizes_[i] = 0; - databuf_[i] = NULL; - databuf_strides_[i] = 0; - } - - num_outbufs_ = num_outbufs; - } -} - -void MJpegDecoder::DestroyOutputBuffers() { - for (int i = 0; i < num_outbufs_; ++i) { - delete[] scanlines_[i]; - delete[] databuf_[i]; - } - delete[] scanlines_; - delete[] databuf_; - delete[] scanlines_sizes_; - delete[] databuf_strides_; - scanlines_ = NULL; - databuf_ = NULL; - scanlines_sizes_ = NULL; - databuf_strides_ = NULL; - num_outbufs_ = 0; -} - -// JDCT_IFAST and do_block_smoothing improve performance substantially. -LIBYUV_BOOL MJpegDecoder::StartDecode() { - decompress_struct_->raw_data_out = TRUE; - decompress_struct_->dct_method = JDCT_IFAST; // JDCT_ISLOW is default - decompress_struct_->dither_mode = JDITHER_NONE; - // Not applicable to 'raw': - decompress_struct_->do_fancy_upsampling = (boolean)(LIBYUV_FALSE); - // Only for buffered mode: - decompress_struct_->enable_2pass_quant = (boolean)(LIBYUV_FALSE); - // Blocky but fast: - decompress_struct_->do_block_smoothing = (boolean)(LIBYUV_FALSE); - - if (!jpeg_start_decompress(decompress_struct_)) { - // ERROR: Couldn't start JPEG decompressor"; - return LIBYUV_FALSE; - } - return LIBYUV_TRUE; -} - -LIBYUV_BOOL MJpegDecoder::FinishDecode() { - // jpeglib considers it an error if we finish without decoding the whole - // image, so we call "abort" rather than "finish". - jpeg_abort_decompress(decompress_struct_); - return LIBYUV_TRUE; -} - -void MJpegDecoder::SetScanlinePointers(uint8_t** data) { - for (int i = 0; i < num_outbufs_; ++i) { - uint8_t* data_i = data[i]; - for (int j = 0; j < scanlines_sizes_[i]; ++j) { - scanlines_[i][j] = data_i; - data_i += GetComponentStride(i); - } - } -} - -inline LIBYUV_BOOL MJpegDecoder::DecodeImcuRow() { - return (unsigned int)(GetImageScanlinesPerImcuRow()) == - jpeg_read_raw_data(decompress_struct_, scanlines_, - GetImageScanlinesPerImcuRow()); -} - -// The helper function which recognizes the jpeg sub-sampling type. -JpegSubsamplingType MJpegDecoder::JpegSubsamplingTypeHelper( - int* subsample_x, - int* subsample_y, - int number_of_components) { - if (number_of_components == 3) { // Color images. - if (subsample_x[0] == 1 && subsample_y[0] == 1 && subsample_x[1] == 2 && - subsample_y[1] == 2 && subsample_x[2] == 2 && subsample_y[2] == 2) { - return kJpegYuv420; - } - if (subsample_x[0] == 1 && subsample_y[0] == 1 && subsample_x[1] == 2 && - subsample_y[1] == 1 && subsample_x[2] == 2 && subsample_y[2] == 1) { - return kJpegYuv422; - } - if (subsample_x[0] == 1 && subsample_y[0] == 1 && subsample_x[1] == 1 && - subsample_y[1] == 1 && subsample_x[2] == 1 && subsample_y[2] == 1) { - return kJpegYuv444; - } - } else if (number_of_components == 1) { // Grey-scale images. 
- if (subsample_x[0] == 1 && subsample_y[0] == 1) { - return kJpegYuv400; - } - } - return kJpegUnknown; -} - -} // namespace libyuv -#endif // HAVE_JPEG diff --git a/thirdparty/libyuv/source/mjpeg_validate.cc b/thirdparty/libyuv/source/mjpeg_validate.cc deleted file mode 100644 index ba0a03a..0000000 --- a/thirdparty/libyuv/source/mjpeg_validate.cc +++ /dev/null @@ -1,71 +0,0 @@ -/* - * Copyright 2012 The LibYuv Project Authors. All rights reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#include "libyuv/mjpeg_decoder.h" - -#include <string.h> // For memchr. - -#ifdef __cplusplus -namespace libyuv { -extern "C" { -#endif - -// Helper function to scan for EOI marker (0xff 0xd9). -static LIBYUV_BOOL ScanEOI(const uint8_t* src_mjpg, size_t src_size_mjpg) { - if (src_size_mjpg >= 2) { - const uint8_t* end = src_mjpg + src_size_mjpg - 1; - const uint8_t* it = src_mjpg; - while (it < end) { - // TODO(fbarchard): scan for 0xd9 instead. - it = (const uint8_t*)(memchr(it, 0xff, end - it)); - if (it == NULL) { - break; - } - if (it[1] == 0xd9) { - return LIBYUV_TRUE; // Success: Valid jpeg. - } - ++it; // Skip over current 0xff. - } - } - // ERROR: Invalid jpeg end code not found. Size src_size_mjpg - return LIBYUV_FALSE; -} - -// Helper function to validate the jpeg appears intact. -LIBYUV_BOOL ValidateJpeg(const uint8_t* src_mjpg, size_t src_size_mjpg) { - // Maximum size that ValidateJpeg will consider valid. - const size_t kMaxJpegSize = 0x7fffffffull; - const size_t kBackSearchSize = 1024; - if (src_size_mjpg < 64 || src_size_mjpg > kMaxJpegSize || !src_mjpg) { - // ERROR: Invalid jpeg size: src_size_mjpg - return LIBYUV_FALSE; - } - // SOI marker - if (src_mjpg[0] != 0xff || src_mjpg[1] != 0xd8 || src_mjpg[2] != 0xff) { - // ERROR: Invalid jpeg initial start code - return LIBYUV_FALSE; - } - - // Look for the End Of Image (EOI) marker near the end of the buffer. - if (src_size_mjpg > kBackSearchSize) { - if (ScanEOI(src_mjpg + src_size_mjpg - kBackSearchSize, kBackSearchSize)) { - return LIBYUV_TRUE; // Success: Valid jpeg. - } - // Reduce search size for forward search. - src_size_mjpg = src_size_mjpg - kBackSearchSize + 1; - } - // Step over SOI marker and scan for EOI. - return ScanEOI(src_mjpg + 2, src_size_mjpg - 2); -} - -#ifdef __cplusplus -} // extern "C" -} // namespace libyuv -#endif diff --git a/thirdparty/libyuv/source/planar_functions.cc b/thirdparty/libyuv/source/planar_functions.cc deleted file mode 100644 index 7cea06c..0000000 --- a/thirdparty/libyuv/source/planar_functions.cc +++ /dev/null @@ -1,5063 +0,0 @@ -/* - * Copyright 2011 The LibYuv Project Authors. All rights reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree.
- */ - -#include "libyuv/planar_functions.h" - -#include <assert.h> -#include <string.h> // for memset() - -#include "libyuv/cpu_id.h" -#ifdef HAVE_JPEG -#include "libyuv/mjpeg_decoder.h" -#endif -#include "libyuv/row.h" -#include "libyuv/scale_row.h" // for ScaleRowDown2 - -#ifdef __cplusplus -namespace libyuv { -extern "C" { -#endif - -// Copy a plane of data -LIBYUV_API -void CopyPlane(const uint8_t* src_y, - int src_stride_y, - uint8_t* dst_y, - int dst_stride_y, - int width, - int height) { - int y; - void (*CopyRow)(const uint8_t* src, uint8_t* dst, int width) = CopyRow_C; - // Negative height means invert the image. - if (height < 0) { - height = -height; - dst_y = dst_y + (height - 1) * dst_stride_y; - dst_stride_y = -dst_stride_y; - } - // Coalesce rows. - if (src_stride_y == width && dst_stride_y == width) { - width *= height; - height = 1; - src_stride_y = dst_stride_y = 0; - } - // Nothing to do. - if (src_y == dst_y && src_stride_y == dst_stride_y) { - return; - } - -#if defined(HAS_COPYROW_SSE2) - if (TestCpuFlag(kCpuHasSSE2)) { - CopyRow = IS_ALIGNED(width, 32) ? CopyRow_SSE2 : CopyRow_Any_SSE2; - } -#endif -#if defined(HAS_COPYROW_AVX) - if (TestCpuFlag(kCpuHasAVX)) { - CopyRow = IS_ALIGNED(width, 64) ? CopyRow_AVX : CopyRow_Any_AVX; - } -#endif -#if defined(HAS_COPYROW_ERMS) - if (TestCpuFlag(kCpuHasERMS)) { - CopyRow = CopyRow_ERMS; - } -#endif -#if defined(HAS_COPYROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - CopyRow = IS_ALIGNED(width, 32) ? CopyRow_NEON : CopyRow_Any_NEON; - } -#endif - - // Copy plane - for (y = 0; y < height; ++y) { - CopyRow(src_y, dst_y, width); - src_y += src_stride_y; - dst_y += dst_stride_y; - } -} - -// TODO(fbarchard): Consider support for negative height. -// TODO(fbarchard): Consider stride measured in bytes. -LIBYUV_API -void CopyPlane_16(const uint16_t* src_y, - int src_stride_y, - uint16_t* dst_y, - int dst_stride_y, - int width, - int height) { - int y; - void (*CopyRow)(const uint16_t* src, uint16_t* dst, int width) = CopyRow_16_C; - // Coalesce rows. - if (src_stride_y == width && dst_stride_y == width) { - width *= height; - height = 1; - src_stride_y = dst_stride_y = 0; - } -#if defined(HAS_COPYROW_16_SSE2) - if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 32)) { - CopyRow = CopyRow_16_SSE2; - } -#endif -#if defined(HAS_COPYROW_16_ERMS) - if (TestCpuFlag(kCpuHasERMS)) { - CopyRow = CopyRow_16_ERMS; - } -#endif -#if defined(HAS_COPYROW_16_NEON) - if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 32)) { - CopyRow = CopyRow_16_NEON; - } -#endif - - // Copy plane - for (y = 0; y < height; ++y) { - CopyRow(src_y, dst_y, width); - src_y += src_stride_y; - dst_y += dst_stride_y; - } -} - -// Convert a plane of 16 bit data to 8 bit -LIBYUV_API -void Convert16To8Plane(const uint16_t* src_y, - int src_stride_y, - uint8_t* dst_y, - int dst_stride_y, - int scale, // 16384 for 10 bits - int width, - int height) { - int y; - void (*Convert16To8Row)(const uint16_t* src_y, uint8_t* dst_y, int scale, - int width) = Convert16To8Row_C; - - // Negative height means invert the image. - if (height < 0) { - height = -height; - dst_y = dst_y + (height - 1) * dst_stride_y; - dst_stride_y = -dst_stride_y; - } - // Coalesce rows.
- if (src_stride_y == width && dst_stride_y == width) { - width *= height; - height = 1; - src_stride_y = dst_stride_y = 0; - } -#if defined(HAS_CONVERT16TO8ROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - Convert16To8Row = Convert16To8Row_Any_SSSE3; - if (IS_ALIGNED(width, 16)) { - Convert16To8Row = Convert16To8Row_SSSE3; - } - } -#endif -#if defined(HAS_CONVERT16TO8ROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - Convert16To8Row = Convert16To8Row_Any_AVX2; - if (IS_ALIGNED(width, 32)) { - Convert16To8Row = Convert16To8Row_AVX2; - } - } -#endif - - // Convert plane - for (y = 0; y < height; ++y) { - Convert16To8Row(src_y, dst_y, scale, width); - src_y += src_stride_y; - dst_y += dst_stride_y; - } -} - -// Convert a plane of 8 bit data to 16 bit -LIBYUV_API -void Convert8To16Plane(const uint8_t* src_y, - int src_stride_y, - uint16_t* dst_y, - int dst_stride_y, - int scale, // 16384 for 10 bits - int width, - int height) { - int y; - void (*Convert8To16Row)(const uint8_t* src_y, uint16_t* dst_y, int scale, - int width) = Convert8To16Row_C; - - // Negative height means invert the image. - if (height < 0) { - height = -height; - dst_y = dst_y + (height - 1) * dst_stride_y; - dst_stride_y = -dst_stride_y; - } - // Coalesce rows. - if (src_stride_y == width && dst_stride_y == width) { - width *= height; - height = 1; - src_stride_y = dst_stride_y = 0; - } -#if defined(HAS_CONVERT8TO16ROW_SSE2) - if (TestCpuFlag(kCpuHasSSE2)) { - Convert8To16Row = Convert8To16Row_Any_SSE2; - if (IS_ALIGNED(width, 16)) { - Convert8To16Row = Convert8To16Row_SSE2; - } - } -#endif -#if defined(HAS_CONVERT8TO16ROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - Convert8To16Row = Convert8To16Row_Any_AVX2; - if (IS_ALIGNED(width, 32)) { - Convert8To16Row = Convert8To16Row_AVX2; - } - } -#endif - - // Convert plane - for (y = 0; y < height; ++y) { - Convert8To16Row(src_y, dst_y, scale, width); - src_y += src_stride_y; - dst_y += dst_stride_y; - } -} - -// Copy I422. -LIBYUV_API -int I422Copy(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_y, - int dst_stride_y, - uint8_t* dst_u, - int dst_stride_u, - uint8_t* dst_v, - int dst_stride_v, - int width, - int height) { - int halfwidth = (width + 1) >> 1; - if (!src_u || !src_v || !dst_u || !dst_v || width <= 0 || height == 0) { - return -1; - } - // Negative height means invert the image. - if (height < 0) { - height = -height; - src_y = src_y + (height - 1) * src_stride_y; - src_u = src_u + (height - 1) * src_stride_u; - src_v = src_v + (height - 1) * src_stride_v; - src_stride_y = -src_stride_y; - src_stride_u = -src_stride_u; - src_stride_v = -src_stride_v; - } - - if (dst_y) { - CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height); - } - CopyPlane(src_u, src_stride_u, dst_u, dst_stride_u, halfwidth, height); - CopyPlane(src_v, src_stride_v, dst_v, dst_stride_v, halfwidth, height); - return 0; -} - -// Copy I444. -LIBYUV_API -int I444Copy(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_y, - int dst_stride_y, - uint8_t* dst_u, - int dst_stride_u, - uint8_t* dst_v, - int dst_stride_v, - int width, - int height) { - if (!src_u || !src_v || !dst_u || !dst_v || width <= 0 || height == 0) { - return -1; - } - // Negative height means invert the image. 
- if (height < 0) { - height = -height; - src_y = src_y + (height - 1) * src_stride_y; - src_u = src_u + (height - 1) * src_stride_u; - src_v = src_v + (height - 1) * src_stride_v; - src_stride_y = -src_stride_y; - src_stride_u = -src_stride_u; - src_stride_v = -src_stride_v; - } - - if (dst_y) { - CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height); - } - CopyPlane(src_u, src_stride_u, dst_u, dst_stride_u, width, height); - CopyPlane(src_v, src_stride_v, dst_v, dst_stride_v, width, height); - return 0; -} - -// Copy I400. -LIBYUV_API -int I400ToI400(const uint8_t* src_y, - int src_stride_y, - uint8_t* dst_y, - int dst_stride_y, - int width, - int height) { - if (!src_y || !dst_y || width <= 0 || height == 0) { - return -1; - } - // Negative height means invert the image. - if (height < 0) { - height = -height; - src_y = src_y + (height - 1) * src_stride_y; - src_stride_y = -src_stride_y; - } - CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height); - return 0; -} - -// Convert I420 to I400. -LIBYUV_API -int I420ToI400(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_y, - int dst_stride_y, - int width, - int height) { - (void)src_u; - (void)src_stride_u; - (void)src_v; - (void)src_stride_v; - if (!src_y || !dst_y || width <= 0 || height == 0) { - return -1; - } - // Negative height means invert the image. - if (height < 0) { - height = -height; - src_y = src_y + (height - 1) * src_stride_y; - src_stride_y = -src_stride_y; - } - - CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height); - return 0; -} - -// Copy NV12. Supports inverting. -int NV12Copy(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_uv, - int src_stride_uv, - uint8_t* dst_y, - int dst_stride_y, - uint8_t* dst_uv, - int dst_stride_uv, - int width, - int height) { - if (!src_y || !dst_y || !src_uv || !dst_uv || width <= 0 || height == 0) { - return -1; - } - - int halfwidth = (width + 1) >> 1; - int halfheight = (height + 1) >> 1; - // Negative height means invert the image. - if (height < 0) { - height = -height; - halfheight = (height + 1) >> 1; - src_y = src_y + (height - 1) * src_stride_y; - src_uv = src_uv + (halfheight - 1) * src_stride_uv; - src_stride_y = -src_stride_y; - src_stride_uv = -src_stride_uv; - } - CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height); - CopyPlane(src_uv, src_stride_uv, dst_uv, dst_stride_uv, halfwidth * 2, - halfheight); - return 0; -} - -// Copy NV21. Supports inverting. -int NV21Copy(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_vu, - int src_stride_vu, - uint8_t* dst_y, - int dst_stride_y, - uint8_t* dst_vu, - int dst_stride_vu, - int width, - int height) { - return NV12Copy(src_y, src_stride_y, src_vu, src_stride_vu, dst_y, - dst_stride_y, dst_vu, dst_stride_vu, width, height); -} - -// Support function for NV12 etc UV channels. -// Width and height are plane sizes (typically half pixel width). -LIBYUV_API -void SplitUVPlane(const uint8_t* src_uv, - int src_stride_uv, - uint8_t* dst_u, - int dst_stride_u, - uint8_t* dst_v, - int dst_stride_v, - int width, - int height) { - int y; - void (*SplitUVRow)(const uint8_t* src_uv, uint8_t* dst_u, uint8_t* dst_v, - int width) = SplitUVRow_C; - // Negative height means invert the image. 
- if (height < 0) { - height = -height; - dst_u = dst_u + (height - 1) * dst_stride_u; - dst_v = dst_v + (height - 1) * dst_stride_v; - dst_stride_u = -dst_stride_u; - dst_stride_v = -dst_stride_v; - } - // Coalesce rows. - if (src_stride_uv == width * 2 && dst_stride_u == width && - dst_stride_v == width) { - width *= height; - height = 1; - src_stride_uv = dst_stride_u = dst_stride_v = 0; - } -#if defined(HAS_SPLITUVROW_SSE2) - if (TestCpuFlag(kCpuHasSSE2)) { - SplitUVRow = SplitUVRow_Any_SSE2; - if (IS_ALIGNED(width, 16)) { - SplitUVRow = SplitUVRow_SSE2; - } - } -#endif -#if defined(HAS_SPLITUVROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - SplitUVRow = SplitUVRow_Any_AVX2; - if (IS_ALIGNED(width, 32)) { - SplitUVRow = SplitUVRow_AVX2; - } - } -#endif -#if defined(HAS_SPLITUVROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - SplitUVRow = SplitUVRow_Any_NEON; - if (IS_ALIGNED(width, 16)) { - SplitUVRow = SplitUVRow_NEON; - } - } -#endif -#if defined(HAS_SPLITUVROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - SplitUVRow = SplitUVRow_Any_MMI; - if (IS_ALIGNED(width, 8)) { - SplitUVRow = SplitUVRow_MMI; - } - } -#endif -#if defined(HAS_SPLITUVROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - SplitUVRow = SplitUVRow_Any_MSA; - if (IS_ALIGNED(width, 32)) { - SplitUVRow = SplitUVRow_MSA; - } - } -#endif - - for (y = 0; y < height; ++y) { - // Copy a row of UV. - SplitUVRow(src_uv, dst_u, dst_v, width); - dst_u += dst_stride_u; - dst_v += dst_stride_v; - src_uv += src_stride_uv; - } -} - -LIBYUV_API -void MergeUVPlane(const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_uv, - int dst_stride_uv, - int width, - int height) { - int y; - void (*MergeUVRow)(const uint8_t* src_u, const uint8_t* src_v, - uint8_t* dst_uv, int width) = MergeUVRow_C; - // Negative height means invert the image. - if (height < 0) { - height = -height; - dst_uv = dst_uv + (height - 1) * dst_stride_uv; - dst_stride_uv = -dst_stride_uv; - } - // Coalesce rows. - if (src_stride_u == width && src_stride_v == width && - dst_stride_uv == width * 2) { - width *= height; - height = 1; - src_stride_u = src_stride_v = dst_stride_uv = 0; - } -#if defined(HAS_MERGEUVROW_SSE2) - if (TestCpuFlag(kCpuHasSSE2)) { - MergeUVRow = MergeUVRow_Any_SSE2; - if (IS_ALIGNED(width, 16)) { - MergeUVRow = MergeUVRow_SSE2; - } - } -#endif -#if defined(HAS_MERGEUVROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - MergeUVRow = MergeUVRow_Any_AVX2; - if (IS_ALIGNED(width, 32)) { - MergeUVRow = MergeUVRow_AVX2; - } - } -#endif -#if defined(HAS_MERGEUVROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - MergeUVRow = MergeUVRow_Any_NEON; - if (IS_ALIGNED(width, 16)) { - MergeUVRow = MergeUVRow_NEON; - } - } -#endif -#if defined(HAS_MERGEUVROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - MergeUVRow = MergeUVRow_Any_MMI; - if (IS_ALIGNED(width, 8)) { - MergeUVRow = MergeUVRow_MMI; - } - } -#endif -#if defined(HAS_MERGEUVROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - MergeUVRow = MergeUVRow_Any_MSA; - if (IS_ALIGNED(width, 16)) { - MergeUVRow = MergeUVRow_MSA; - } - } -#endif - - for (y = 0; y < height; ++y) { - // Merge a row of U and V into a row of UV. - MergeUVRow(src_u, src_v, dst_uv, width); - src_u += src_stride_u; - src_v += src_stride_v; - dst_uv += dst_stride_uv; - } -} - -// Support function for P010 etc UV channels. -// Width and height are plane sizes (typically half pixel width). 
-LIBYUV_API -void SplitUVPlane_16(const uint16_t* src_uv, - int src_stride_uv, - uint16_t* dst_u, - int dst_stride_u, - uint16_t* dst_v, - int dst_stride_v, - int width, - int height, - int depth) { - int y; - void (*SplitUVRow_16)(const uint16_t* src_uv, uint16_t* dst_u, - uint16_t* dst_v, int depth, int width) = - SplitUVRow_16_C; - // Negative height means invert the image. - if (height < 0) { - height = -height; - dst_u = dst_u + (height - 1) * dst_stride_u; - dst_v = dst_v + (height - 1) * dst_stride_v; - dst_stride_u = -dst_stride_u; - dst_stride_v = -dst_stride_v; - } - // Coalesce rows. - if (src_stride_uv == width * 2 && dst_stride_u == width && - dst_stride_v == width) { - width *= height; - height = 1; - src_stride_uv = dst_stride_u = dst_stride_v = 0; - } -#if defined(HAS_SPLITUVROW_16_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - SplitUVRow_16 = SplitUVRow_16_Any_AVX2; - if (IS_ALIGNED(width, 16)) { - SplitUVRow_16 = SplitUVRow_16_AVX2; - } - } -#endif -#if defined(HAS_SPLITUVROW_16_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - SplitUVRow_16 = SplitUVRow_16_Any_NEON; - if (IS_ALIGNED(width, 8)) { - SplitUVRow_16 = SplitUVRow_16_NEON; - } - } -#endif - - for (y = 0; y < height; ++y) { - // Copy a row of UV. - SplitUVRow_16(src_uv, dst_u, dst_v, depth, width); - dst_u += dst_stride_u; - dst_v += dst_stride_v; - src_uv += src_stride_uv; - } -} - -LIBYUV_API -void MergeUVPlane_16(const uint16_t* src_u, - int src_stride_u, - const uint16_t* src_v, - int src_stride_v, - uint16_t* dst_uv, - int dst_stride_uv, - int width, - int height, - int depth) { - int y; - void (*MergeUVRow_16)(const uint16_t* src_u, const uint16_t* src_v, - uint16_t* dst_uv, int depth, int width) = - MergeUVRow_16_C; - assert(depth >= 8); - assert(depth <= 16); - // Negative height means invert the image. - if (height < 0) { - height = -height; - dst_uv = dst_uv + (height - 1) * dst_stride_uv; - dst_stride_uv = -dst_stride_uv; - } - // Coalesce rows. - if (src_stride_u == width && src_stride_v == width && - dst_stride_uv == width * 2) { - width *= height; - height = 1; - src_stride_u = src_stride_v = dst_stride_uv = 0; - } -#if defined(HAS_MERGEUVROW_16_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - MergeUVRow_16 = MergeUVRow_16_Any_AVX2; - if (IS_ALIGNED(width, 16)) { - MergeUVRow_16 = MergeUVRow_16_AVX2; - } - } -#endif -#if defined(HAS_MERGEUVROW_16_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - MergeUVRow_16 = MergeUVRow_16_Any_NEON; - if (IS_ALIGNED(width, 8)) { - MergeUVRow_16 = MergeUVRow_16_NEON; - } - } -#endif - - for (y = 0; y < height; ++y) { - // Merge a row of U and V into a row of UV. - MergeUVRow_16(src_u, src_v, dst_uv, depth, width); - src_u += src_stride_u; - src_v += src_stride_v; - dst_uv += dst_stride_uv; - } -} - -// Convert plane from lsb to msb -LIBYUV_API -void ConvertToMSBPlane_16(const uint16_t* src_y, - int src_stride_y, - uint16_t* dst_y, - int dst_stride_y, - int width, - int height, - int depth) { - int y; - int scale = 1 << (16 - depth); - void (*MultiplyRow_16)(const uint16_t* src_y, uint16_t* dst_y, int scale, - int width) = MultiplyRow_16_C; - // Negative height means invert the image. - if (height < 0) { - height = -height; - dst_y = dst_y + (height - 1) * dst_stride_y; - dst_stride_y = -dst_stride_y; - } - // Coalesce rows. 
- if (src_stride_y == width && dst_stride_y == width) { - width *= height; - height = 1; - src_stride_y = dst_stride_y = 0; - } - -#if defined(HAS_MULTIPLYROW_16_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - MultiplyRow_16 = MultiplyRow_16_Any_AVX2; - if (IS_ALIGNED(width, 32)) { - MultiplyRow_16 = MultiplyRow_16_AVX2; - } - } -#endif -#if defined(HAS_MULTIPLYROW_16_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - MultiplyRow_16 = MultiplyRow_16_Any_NEON; - if (IS_ALIGNED(width, 16)) { - MultiplyRow_16 = MultiplyRow_16_NEON; - } - } -#endif - - for (y = 0; y < height; ++y) { - MultiplyRow_16(src_y, dst_y, scale, width); - src_y += src_stride_y; - dst_y += dst_stride_y; - } -} - -// Convert plane from msb to lsb -LIBYUV_API -void ConvertToLSBPlane_16(const uint16_t* src_y, - int src_stride_y, - uint16_t* dst_y, - int dst_stride_y, - int width, - int height, - int depth) { - int y; - int scale = 1 << depth; - void (*DivideRow)(const uint16_t* src_y, uint16_t* dst_y, int scale, - int width) = DivideRow_16_C; - // Negative height means invert the image. - if (height < 0) { - height = -height; - dst_y = dst_y + (height - 1) * dst_stride_y; - dst_stride_y = -dst_stride_y; - } - // Coalesce rows. - if (src_stride_y == width && dst_stride_y == width) { - width *= height; - height = 1; - src_stride_y = dst_stride_y = 0; - } - -#if defined(HAS_DIVIDEROW_16_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - DivideRow = DivideRow_16_Any_AVX2; - if (IS_ALIGNED(width, 32)) { - DivideRow = DivideRow_16_AVX2; - } - } -#endif -#if defined(HAS_DIVIDEROW_16_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - DivideRow = DivideRow_16_Any_NEON; - if (IS_ALIGNED(width, 16)) { - DivideRow = DivideRow_16_NEON; - } - } -#endif - - for (y = 0; y < height; ++y) { - DivideRow(src_y, dst_y, scale, width); - src_y += src_stride_y; - dst_y += dst_stride_y; - } -} - -// Swap U and V channels in interleaved UV plane. -LIBYUV_API -void SwapUVPlane(const uint8_t* src_uv, - int src_stride_uv, - uint8_t* dst_vu, - int dst_stride_vu, - int width, - int height) { - int y; - void (*SwapUVRow)(const uint8_t* src_uv, uint8_t* dst_vu, int width) = - SwapUVRow_C; - // Negative height means invert the image. - if (height < 0) { - height = -height; - src_uv = src_uv + (height - 1) * src_stride_uv; - src_stride_uv = -src_stride_uv; - } - // Coalesce rows. - if (src_stride_uv == width * 2 && dst_stride_vu == width * 2) { - width *= height; - height = 1; - src_stride_uv = dst_stride_vu = 0; - } - -#if defined(HAS_SWAPUVROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - SwapUVRow = SwapUVRow_Any_SSSE3; - if (IS_ALIGNED(width, 16)) { - SwapUVRow = SwapUVRow_SSSE3; - } - } -#endif -#if defined(HAS_SWAPUVROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - SwapUVRow = SwapUVRow_Any_AVX2; - if (IS_ALIGNED(width, 32)) { - SwapUVRow = SwapUVRow_AVX2; - } - } -#endif -#if defined(HAS_SWAPUVROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - SwapUVRow = SwapUVRow_Any_NEON; - if (IS_ALIGNED(width, 16)) { - SwapUVRow = SwapUVRow_NEON; - } - } -#endif - - for (y = 0; y < height; ++y) { - SwapUVRow(src_uv, dst_vu, width); - src_uv += src_stride_uv; - dst_vu += dst_stride_vu; - } -} - -// Convert NV21 to NV12. 
-LIBYUV_API -int NV21ToNV12(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_vu, - int src_stride_vu, - uint8_t* dst_y, - int dst_stride_y, - uint8_t* dst_uv, - int dst_stride_uv, - int width, - int height) { - int halfwidth = (width + 1) >> 1; - int halfheight = (height + 1) >> 1; - if (!src_vu || !dst_uv || width <= 0 || height == 0) { - return -1; - } - if (dst_y) { - CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height); - } - - // Negative height means invert the image. - if (height < 0) { - height = -height; - halfheight = (height + 1) >> 1; - src_vu = src_vu + (halfheight - 1) * src_stride_vu; - src_stride_vu = -src_stride_vu; - } - - SwapUVPlane(src_vu, src_stride_vu, dst_uv, dst_stride_uv, halfwidth, - halfheight); - return 0; -} - -// Support function for NV12 etc RGB channels. -// Width and height are plane sizes (typically half pixel width). -LIBYUV_API -void SplitRGBPlane(const uint8_t* src_rgb, - int src_stride_rgb, - uint8_t* dst_r, - int dst_stride_r, - uint8_t* dst_g, - int dst_stride_g, - uint8_t* dst_b, - int dst_stride_b, - int width, - int height) { - int y; - void (*SplitRGBRow)(const uint8_t* src_rgb, uint8_t* dst_r, uint8_t* dst_g, - uint8_t* dst_b, int width) = SplitRGBRow_C; - // Negative height means invert the image. - if (height < 0) { - height = -height; - dst_r = dst_r + (height - 1) * dst_stride_r; - dst_g = dst_g + (height - 1) * dst_stride_g; - dst_b = dst_b + (height - 1) * dst_stride_b; - dst_stride_r = -dst_stride_r; - dst_stride_g = -dst_stride_g; - dst_stride_b = -dst_stride_b; - } - // Coalesce rows. - if (src_stride_rgb == width * 3 && dst_stride_r == width && - dst_stride_g == width && dst_stride_b == width) { - width *= height; - height = 1; - src_stride_rgb = dst_stride_r = dst_stride_g = dst_stride_b = 0; - } -#if defined(HAS_SPLITRGBROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - SplitRGBRow = SplitRGBRow_Any_SSSE3; - if (IS_ALIGNED(width, 16)) { - SplitRGBRow = SplitRGBRow_SSSE3; - } - } -#endif -#if defined(HAS_SPLITRGBROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - SplitRGBRow = SplitRGBRow_Any_MMI; - if (IS_ALIGNED(width, 4)) { - SplitRGBRow = SplitRGBRow_MMI; - } - } -#endif -#if defined(HAS_SPLITRGBROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - SplitRGBRow = SplitRGBRow_Any_NEON; - if (IS_ALIGNED(width, 16)) { - SplitRGBRow = SplitRGBRow_NEON; - } - } -#endif - - for (y = 0; y < height; ++y) { - // Copy a row of RGB. - SplitRGBRow(src_rgb, dst_r, dst_g, dst_b, width); - dst_r += dst_stride_r; - dst_g += dst_stride_g; - dst_b += dst_stride_b; - src_rgb += src_stride_rgb; - } -} - -LIBYUV_API -void MergeRGBPlane(const uint8_t* src_r, - int src_stride_r, - const uint8_t* src_g, - int src_stride_g, - const uint8_t* src_b, - int src_stride_b, - uint8_t* dst_rgb, - int dst_stride_rgb, - int width, - int height) { - int y; - void (*MergeRGBRow)(const uint8_t* src_r, const uint8_t* src_g, - const uint8_t* src_b, uint8_t* dst_rgb, int width) = - MergeRGBRow_C; - // Coalesce rows. - // Negative height means invert the image. - if (height < 0) { - height = -height; - dst_rgb = dst_rgb + (height - 1) * dst_stride_rgb; - dst_stride_rgb = -dst_stride_rgb; - } - // Coalesce rows. 
- if (src_stride_r == width && src_stride_g == width && src_stride_b == width && - dst_stride_rgb == width * 3) { - width *= height; - height = 1; - src_stride_r = src_stride_g = src_stride_b = dst_stride_rgb = 0; - } -#if defined(HAS_MERGERGBROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - MergeRGBRow = MergeRGBRow_Any_SSSE3; - if (IS_ALIGNED(width, 16)) { - MergeRGBRow = MergeRGBRow_SSSE3; - } - } -#endif -#if defined(HAS_MERGERGBROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - MergeRGBRow = MergeRGBRow_Any_NEON; - if (IS_ALIGNED(width, 16)) { - MergeRGBRow = MergeRGBRow_NEON; - } - } -#endif -#if defined(HAS_MERGERGBROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - MergeRGBRow = MergeRGBRow_Any_MMI; - if (IS_ALIGNED(width, 8)) { - MergeRGBRow = MergeRGBRow_MMI; - } - } -#endif - - for (y = 0; y < height; ++y) { - // Merge a row of U and V into a row of RGB. - MergeRGBRow(src_r, src_g, src_b, dst_rgb, width); - src_r += src_stride_r; - src_g += src_stride_g; - src_b += src_stride_b; - dst_rgb += dst_stride_rgb; - } -} - -LIBYUV_NOINLINE -void SplitARGBPlaneAlpha(const uint8_t* src_argb, - int src_stride_argb, - uint8_t* dst_r, - int dst_stride_r, - uint8_t* dst_g, - int dst_stride_g, - uint8_t* dst_b, - int dst_stride_b, - uint8_t* dst_a, - int dst_stride_a, - int width, - int height) { - int y; - void (*SplitARGBRow)(const uint8_t* src_rgb, uint8_t* dst_r, uint8_t* dst_g, - uint8_t* dst_b, uint8_t* dst_a, int width) = - SplitARGBRow_C; - - assert(height > 0); - - if (src_stride_argb == width * 4 && dst_stride_r == width && - dst_stride_g == width && dst_stride_b == width && dst_stride_a == width) { - width *= height; - height = 1; - src_stride_argb = dst_stride_r = dst_stride_g = dst_stride_b = - dst_stride_a = 0; - } - -#if defined(HAS_SPLITARGBROW_SSE2) - if (TestCpuFlag(kCpuHasSSE2)) { - SplitARGBRow = SplitARGBRow_Any_SSE2; - if (IS_ALIGNED(width, 8)) { - SplitARGBRow = SplitARGBRow_SSE2; - } - } -#endif -#if defined(HAS_SPLITARGBROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - SplitARGBRow = SplitARGBRow_Any_SSSE3; - if (IS_ALIGNED(width, 8)) { - SplitARGBRow = SplitARGBRow_SSSE3; - } - } -#endif -#if defined(HAS_SPLITARGBROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - SplitARGBRow = SplitARGBRow_Any_AVX2; - if (IS_ALIGNED(width, 16)) { - SplitARGBRow = SplitARGBRow_AVX2; - } - } -#endif -#if defined(HAS_SPLITARGBROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - SplitARGBRow = SplitARGBRow_Any_NEON; - if (IS_ALIGNED(width, 16)) { - SplitARGBRow = SplitARGBRow_NEON; - } - } -#endif - - for (y = 0; y < height; ++y) { - SplitARGBRow(src_argb, dst_r, dst_g, dst_b, dst_a, width); - dst_r += dst_stride_r; - dst_g += dst_stride_g; - dst_b += dst_stride_b; - dst_a += dst_stride_a; - src_argb += src_stride_argb; - } -} - -LIBYUV_NOINLINE -void SplitARGBPlaneOpaque(const uint8_t* src_argb, - int src_stride_argb, - uint8_t* dst_r, - int dst_stride_r, - uint8_t* dst_g, - int dst_stride_g, - uint8_t* dst_b, - int dst_stride_b, - int width, - int height) { - int y; - void (*SplitXRGBRow)(const uint8_t* src_rgb, uint8_t* dst_r, uint8_t* dst_g, - uint8_t* dst_b, int width) = SplitXRGBRow_C; - assert(height > 0); - - if (src_stride_argb == width * 4 && dst_stride_r == width && - dst_stride_g == width && dst_stride_b == width) { - width *= height; - height = 1; - src_stride_argb = dst_stride_r = dst_stride_g = dst_stride_b = 0; - } - -#if defined(HAS_SPLITXRGBROW_SSE2) - if (TestCpuFlag(kCpuHasSSE2)) { - SplitXRGBRow = SplitXRGBRow_Any_SSE2; - if (IS_ALIGNED(width, 8)) { - SplitXRGBRow = SplitXRGBRow_SSE2; - } - } 
-#endif -#if defined(HAS_SPLITXRGBROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - SplitXRGBRow = SplitXRGBRow_Any_SSSE3; - if (IS_ALIGNED(width, 8)) { - SplitXRGBRow = SplitXRGBRow_SSSE3; - } - } -#endif -#if defined(HAS_SPLITXRGBROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - SplitXRGBRow = SplitXRGBRow_Any_AVX2; - if (IS_ALIGNED(width, 16)) { - SplitXRGBRow = SplitXRGBRow_AVX2; - } - } -#endif -#if defined(HAS_SPLITXRGBROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - SplitXRGBRow = SplitXRGBRow_Any_NEON; - if (IS_ALIGNED(width, 16)) { - SplitXRGBRow = SplitXRGBRow_NEON; - } - } -#endif - - for (y = 0; y < height; ++y) { - SplitXRGBRow(src_argb, dst_r, dst_g, dst_b, width); - dst_r += dst_stride_r; - dst_g += dst_stride_g; - dst_b += dst_stride_b; - src_argb += src_stride_argb; - } -} - -LIBYUV_API -void SplitARGBPlane(const uint8_t* src_argb, - int src_stride_argb, - uint8_t* dst_r, - int dst_stride_r, - uint8_t* dst_g, - int dst_stride_g, - uint8_t* dst_b, - int dst_stride_b, - uint8_t* dst_a, - int dst_stride_a, - int width, - int height) { - // Negative height means invert the image. - if (height < 0) { - height = -height; - dst_r = dst_r + (height - 1) * dst_stride_r; - dst_g = dst_g + (height - 1) * dst_stride_g; - dst_b = dst_b + (height - 1) * dst_stride_b; - dst_a = dst_a + (height - 1) * dst_stride_a; - dst_stride_r = -dst_stride_r; - dst_stride_g = -dst_stride_g; - dst_stride_b = -dst_stride_b; - dst_stride_a = -dst_stride_a; - } - - if (dst_a == NULL) { - SplitARGBPlaneOpaque(src_argb, src_stride_argb, dst_r, dst_stride_r, dst_g, - dst_stride_g, dst_b, dst_stride_b, width, height); - } else { - SplitARGBPlaneAlpha(src_argb, src_stride_argb, dst_r, dst_stride_r, dst_g, - dst_stride_g, dst_b, dst_stride_b, dst_a, dst_stride_a, - width, height); - } -} - -LIBYUV_NOINLINE -void MergeARGBPlaneAlpha(const uint8_t* src_r, - int src_stride_r, - const uint8_t* src_g, - int src_stride_g, - const uint8_t* src_b, - int src_stride_b, - const uint8_t* src_a, - int src_stride_a, - uint8_t* dst_argb, - int dst_stride_argb, - int width, - int height) { - int y; - void (*MergeARGBRow)(const uint8_t* src_r, const uint8_t* src_g, - const uint8_t* src_b, const uint8_t* src_a, - uint8_t* dst_argb, int width) = MergeARGBRow_C; - - assert(height > 0); - - if (src_stride_r == width && src_stride_g == width && src_stride_b == width && - src_stride_a == width && dst_stride_argb == width * 4) { - width *= height; - height = 1; - src_stride_r = src_stride_g = src_stride_b = src_stride_a = - dst_stride_argb = 0; - } -#if defined(HAS_MERGEARGBROW_SSE2) - if (TestCpuFlag(kCpuHasSSE2)) { - MergeARGBRow = MergeARGBRow_Any_SSE2; - if (IS_ALIGNED(width, 8)) { - MergeARGBRow = MergeARGBRow_SSE2; - } - } -#endif -#if defined(HAS_MERGEARGBROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - MergeARGBRow = MergeARGBRow_Any_AVX2; - if (IS_ALIGNED(width, 16)) { - MergeARGBRow = MergeARGBRow_AVX2; - } - } -#endif -#if defined(HAS_MERGEARGBROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - MergeARGBRow = MergeARGBRow_Any_NEON; - if (IS_ALIGNED(width, 16)) { - MergeARGBRow = MergeARGBRow_NEON; - } - } -#endif - - for (y = 0; y < height; ++y) { - MergeARGBRow(src_r, src_g, src_b, src_a, dst_argb, width); - src_r += src_stride_r; - src_g += src_stride_g; - src_b += src_stride_b; - src_a += src_stride_a; - dst_argb += dst_stride_argb; - } -} - -LIBYUV_NOINLINE -void MergeARGBPlaneOpaque(const uint8_t* src_r, - int src_stride_r, - const uint8_t* src_g, - int src_stride_g, - const uint8_t* src_b, - int src_stride_b, - uint8_t* dst_argb, - int 
dst_stride_argb, - int width, - int height) { - int y; - void (*MergeXRGBRow)(const uint8_t* src_r, const uint8_t* src_g, - const uint8_t* src_b, uint8_t* dst_argb, int width) = - MergeXRGBRow_C; - - assert(height > 0); - - if (src_stride_r == width && src_stride_g == width && src_stride_b == width && - dst_stride_argb == width * 4) { - width *= height; - height = 1; - src_stride_r = src_stride_g = src_stride_b = dst_stride_argb = 0; - } -#if defined(HAS_MERGEXRGBROW_SSE2) - if (TestCpuFlag(kCpuHasSSE2)) { - MergeXRGBRow = MergeXRGBRow_Any_SSE2; - if (IS_ALIGNED(width, 8)) { - MergeXRGBRow = MergeXRGBRow_SSE2; - } - } -#endif -#if defined(HAS_MERGEXRGBROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - MergeXRGBRow = MergeXRGBRow_Any_AVX2; - if (IS_ALIGNED(width, 16)) { - MergeXRGBRow = MergeXRGBRow_AVX2; - } - } -#endif -#if defined(HAS_MERGEXRGBROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - MergeXRGBRow = MergeXRGBRow_Any_NEON; - if (IS_ALIGNED(width, 16)) { - MergeXRGBRow = MergeXRGBRow_NEON; - } - } -#endif - - for (y = 0; y < height; ++y) { - MergeXRGBRow(src_r, src_g, src_b, dst_argb, width); - src_r += src_stride_r; - src_g += src_stride_g; - src_b += src_stride_b; - dst_argb += dst_stride_argb; - } -} - -LIBYUV_API -void MergeARGBPlane(const uint8_t* src_r, - int src_stride_r, - const uint8_t* src_g, - int src_stride_g, - const uint8_t* src_b, - int src_stride_b, - const uint8_t* src_a, - int src_stride_a, - uint8_t* dst_argb, - int dst_stride_argb, - int width, - int height) { - // Negative height means invert the image. - if (height < 0) { - height = -height; - dst_argb = dst_argb + (height - 1) * dst_stride_argb; - dst_stride_argb = -dst_stride_argb; - } - - if (src_a == NULL) { - MergeARGBPlaneOpaque(src_r, src_stride_r, src_g, src_stride_g, src_b, - src_stride_b, dst_argb, dst_stride_argb, width, - height); - } else { - MergeARGBPlaneAlpha(src_r, src_stride_r, src_g, src_stride_g, src_b, - src_stride_b, src_a, src_stride_a, dst_argb, - dst_stride_argb, width, height); - } -} - -// TODO(yuan): Support 2 bit alpha channel. -LIBYUV_API -void MergeXR30Plane(const uint16_t* src_r, - int src_stride_r, - const uint16_t* src_g, - int src_stride_g, - const uint16_t* src_b, - int src_stride_b, - uint8_t* dst_ar30, - int dst_stride_ar30, - int width, - int height, - int depth) { - int y; - void (*MergeXR30Row)(const uint16_t* src_r, const uint16_t* src_g, - const uint16_t* src_b, uint8_t* dst_ar30, int depth, - int width) = MergeXR30Row_C; - - // Negative height means invert the image. - if (height < 0) { - height = -height; - dst_ar30 = dst_ar30 + (height - 1) * dst_stride_ar30; - dst_stride_ar30 = -dst_stride_ar30; - } - // Coalesce rows. 
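
Aside (not part of the diff): a minimal usage sketch for the SplitARGBPlane/MergeARGBPlane pair deleted above, splitting a packed ARGB buffer into per-channel planes and merging it back. The tightly packed strides and the assumption that both entry points are exposed via libyuv's public planar_functions.h are mine; passing NULL for the alpha plane would take the *Opaque paths instead, as the dispatch code above shows. The MergeXR30Plane body resumes in the diff below.

// Sketch only: round-trip a packed ARGB image through per-channel planes.
#include <stdint.h>
#include <vector>
#include "libyuv/planar_functions.h"

void RoundTripArgb(const uint8_t* src_argb, uint8_t* dst_argb,
                   int width, int height) {
  std::vector<uint8_t> r(width * height), g(width * height),
                       b(width * height), a(width * height);
  // Split: the packed image stride is width * 4 bytes, each plane is width.
  libyuv::SplitARGBPlane(src_argb, width * 4,
                         r.data(), width, g.data(), width,
                         b.data(), width, a.data(), width, width, height);
  // Merge back; passing NULL for src_a selects the opaque path instead.
  libyuv::MergeARGBPlane(r.data(), width, g.data(), width, b.data(), width,
                         a.data(), width, dst_argb, width * 4, width, height);
}
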
- if (src_stride_r == width && src_stride_g == width && src_stride_b == width && - dst_stride_ar30 == width * 4) { - width *= height; - height = 1; - src_stride_r = src_stride_g = src_stride_b = dst_stride_ar30 = 0; - } -#if defined(HAS_MERGEXR30ROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - MergeXR30Row = MergeXR30Row_Any_AVX2; - if (IS_ALIGNED(width, 16)) { - MergeXR30Row = MergeXR30Row_AVX2; - } - } -#endif -#if defined(HAS_MERGEXR30ROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - if (depth == 10) { - MergeXR30Row = MergeXR30Row_10_Any_NEON; - if (IS_ALIGNED(width, 8)) { - MergeXR30Row = MergeXR30Row_10_NEON; - } - } else { - MergeXR30Row = MergeXR30Row_Any_NEON; - if (IS_ALIGNED(width, 8)) { - MergeXR30Row = MergeXR30Row_NEON; - } - } - } -#endif - - for (y = 0; y < height; ++y) { - MergeXR30Row(src_r, src_g, src_b, dst_ar30, depth, width); - src_r += src_stride_r; - src_g += src_stride_g; - src_b += src_stride_b; - dst_ar30 += dst_stride_ar30; - } -} - -LIBYUV_NOINLINE -static void MergeAR64PlaneAlpha(const uint16_t* src_r, - int src_stride_r, - const uint16_t* src_g, - int src_stride_g, - const uint16_t* src_b, - int src_stride_b, - const uint16_t* src_a, - int src_stride_a, - uint16_t* dst_ar64, - int dst_stride_ar64, - int width, - int height, - int depth) { - int y; - void (*MergeAR64Row)(const uint16_t* src_r, const uint16_t* src_g, - const uint16_t* src_b, const uint16_t* src_a, - uint16_t* dst_argb, int depth, int width) = - MergeAR64Row_C; - - if (src_stride_r == width && src_stride_g == width && src_stride_b == width && - src_stride_a == width && dst_stride_ar64 == width * 4) { - width *= height; - height = 1; - src_stride_r = src_stride_g = src_stride_b = src_stride_a = - dst_stride_ar64 = 0; - } -#if defined(HAS_MERGEAR64ROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - MergeAR64Row = MergeAR64Row_Any_AVX2; - if (IS_ALIGNED(width, 16)) { - MergeAR64Row = MergeAR64Row_AVX2; - } - } -#endif -#if defined(HAS_MERGEAR64ROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - MergeAR64Row = MergeAR64Row_Any_NEON; - if (IS_ALIGNED(width, 8)) { - MergeAR64Row = MergeAR64Row_NEON; - } - } -#endif - - for (y = 0; y < height; ++y) { - MergeAR64Row(src_r, src_g, src_b, src_a, dst_ar64, depth, width); - src_r += src_stride_r; - src_g += src_stride_g; - src_b += src_stride_b; - src_a += src_stride_a; - dst_ar64 += dst_stride_ar64; - } -} - -LIBYUV_NOINLINE -static void MergeAR64PlaneOpaque(const uint16_t* src_r, - int src_stride_r, - const uint16_t* src_g, - int src_stride_g, - const uint16_t* src_b, - int src_stride_b, - uint16_t* dst_ar64, - int dst_stride_ar64, - int width, - int height, - int depth) { - int y; - void (*MergeXR64Row)(const uint16_t* src_r, const uint16_t* src_g, - const uint16_t* src_b, uint16_t* dst_argb, int depth, - int width) = MergeXR64Row_C; - - // Coalesce rows. 
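
Aside on the "// Coalesce rows." blocks that recur in every function in this file: when every stride equals the visible row width (no padding between rows), the whole image is one contiguous run of pixels, so the function can treat it as a single row of width * height pixels and call the row kernel exactly once. The sketch below illustrates the idea with a made-up row kernel, independent of any libyuv function.

// Sketch of the row-coalescing trick; InvertRow stands in for any row kernel.
#include <stdint.h>

static void InvertRow(const uint8_t* src, uint8_t* dst, int width) {
  for (int x = 0; x < width; ++x) dst[x] = 255 - src[x];
}

void InvertPlane(const uint8_t* src, int src_stride,
                 uint8_t* dst, int dst_stride, int width, int height) {
  if (src_stride == width && dst_stride == width) {
    width *= height;              // whole image is contiguous: one long "row"
    height = 1;
    src_stride = dst_stride = 0;  // strides no longer need to advance anything
  }
  for (int y = 0; y < height; ++y) {
    InvertRow(src, dst, width);
    src += src_stride;
    dst += dst_stride;
  }
}
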
- if (src_stride_r == width && src_stride_g == width && src_stride_b == width && - dst_stride_ar64 == width * 4) { - width *= height; - height = 1; - src_stride_r = src_stride_g = src_stride_b = dst_stride_ar64 = 0; - } -#if defined(HAS_MERGEXR64ROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - MergeXR64Row = MergeXR64Row_Any_AVX2; - if (IS_ALIGNED(width, 16)) { - MergeXR64Row = MergeXR64Row_AVX2; - } - } -#endif -#if defined(HAS_MERGEXR64ROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - MergeXR64Row = MergeXR64Row_Any_NEON; - if (IS_ALIGNED(width, 8)) { - MergeXR64Row = MergeXR64Row_NEON; - } - } -#endif - - for (y = 0; y < height; ++y) { - MergeXR64Row(src_r, src_g, src_b, dst_ar64, depth, width); - src_r += src_stride_r; - src_g += src_stride_g; - src_b += src_stride_b; - dst_ar64 += dst_stride_ar64; - } -} - -LIBYUV_API -void MergeAR64Plane(const uint16_t* src_r, - int src_stride_r, - const uint16_t* src_g, - int src_stride_g, - const uint16_t* src_b, - int src_stride_b, - const uint16_t* src_a, - int src_stride_a, - uint16_t* dst_ar64, - int dst_stride_ar64, - int width, - int height, - int depth) { - // Negative height means invert the image. - if (height < 0) { - height = -height; - dst_ar64 = dst_ar64 + (height - 1) * dst_stride_ar64; - dst_stride_ar64 = -dst_stride_ar64; - } - - if (src_a == NULL) { - MergeAR64PlaneOpaque(src_r, src_stride_r, src_g, src_stride_g, src_b, - src_stride_b, dst_ar64, dst_stride_ar64, width, height, - depth); - } else { - MergeAR64PlaneAlpha(src_r, src_stride_r, src_g, src_stride_g, src_b, - src_stride_b, src_a, src_stride_a, dst_ar64, - dst_stride_ar64, width, height, depth); - } -} - -LIBYUV_NOINLINE -static void MergeARGB16To8PlaneAlpha(const uint16_t* src_r, - int src_stride_r, - const uint16_t* src_g, - int src_stride_g, - const uint16_t* src_b, - int src_stride_b, - const uint16_t* src_a, - int src_stride_a, - uint8_t* dst_argb, - int dst_stride_argb, - int width, - int height, - int depth) { - int y; - void (*MergeARGB16To8Row)(const uint16_t* src_r, const uint16_t* src_g, - const uint16_t* src_b, const uint16_t* src_a, - uint8_t* dst_argb, int depth, int width) = - MergeARGB16To8Row_C; - - if (src_stride_r == width && src_stride_g == width && src_stride_b == width && - src_stride_a == width && dst_stride_argb == width * 4) { - width *= height; - height = 1; - src_stride_r = src_stride_g = src_stride_b = src_stride_a = - dst_stride_argb = 0; - } -#if defined(HAS_MERGEARGB16TO8ROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - MergeARGB16To8Row = MergeARGB16To8Row_Any_AVX2; - if (IS_ALIGNED(width, 16)) { - MergeARGB16To8Row = MergeARGB16To8Row_AVX2; - } - } -#endif -#if defined(HAS_MERGEARGB16TO8ROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - MergeARGB16To8Row = MergeARGB16To8Row_Any_NEON; - if (IS_ALIGNED(width, 8)) { - MergeARGB16To8Row = MergeARGB16To8Row_NEON; - } - } -#endif - - for (y = 0; y < height; ++y) { - MergeARGB16To8Row(src_r, src_g, src_b, src_a, dst_argb, depth, width); - src_r += src_stride_r; - src_g += src_stride_g; - src_b += src_stride_b; - src_a += src_stride_a; - dst_argb += dst_stride_argb; - } -} - -LIBYUV_NOINLINE -static void MergeARGB16To8PlaneOpaque(const uint16_t* src_r, - int src_stride_r, - const uint16_t* src_g, - int src_stride_g, - const uint16_t* src_b, - int src_stride_b, - uint8_t* dst_argb, - int dst_stride_argb, - int width, - int height, - int depth) { - int y; - void (*MergeXRGB16To8Row)(const uint16_t* src_r, const uint16_t* src_g, - const uint16_t* src_b, uint8_t* dst_argb, int depth, - int width) = 
MergeXRGB16To8Row_C; - - // Coalesce rows. - if (src_stride_r == width && src_stride_g == width && src_stride_b == width && - dst_stride_argb == width * 4) { - width *= height; - height = 1; - src_stride_r = src_stride_g = src_stride_b = dst_stride_argb = 0; - } -#if defined(HAS_MERGEXRGB16TO8ROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - MergeXRGB16To8Row = MergeXRGB16To8Row_Any_AVX2; - if (IS_ALIGNED(width, 16)) { - MergeXRGB16To8Row = MergeXRGB16To8Row_AVX2; - } - } -#endif -#if defined(HAS_MERGEXRGB16TO8ROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - MergeXRGB16To8Row = MergeXRGB16To8Row_Any_NEON; - if (IS_ALIGNED(width, 8)) { - MergeXRGB16To8Row = MergeXRGB16To8Row_NEON; - } - } -#endif - - for (y = 0; y < height; ++y) { - MergeXRGB16To8Row(src_r, src_g, src_b, dst_argb, depth, width); - src_r += src_stride_r; - src_g += src_stride_g; - src_b += src_stride_b; - dst_argb += dst_stride_argb; - } -} - -LIBYUV_API -void MergeARGB16To8Plane(const uint16_t* src_r, - int src_stride_r, - const uint16_t* src_g, - int src_stride_g, - const uint16_t* src_b, - int src_stride_b, - const uint16_t* src_a, - int src_stride_a, - uint8_t* dst_argb, - int dst_stride_argb, - int width, - int height, - int depth) { - // Negative height means invert the image. - if (height < 0) { - height = -height; - dst_argb = dst_argb + (height - 1) * dst_stride_argb; - dst_stride_argb = -dst_stride_argb; - } - - if (src_a == NULL) { - MergeARGB16To8PlaneOpaque(src_r, src_stride_r, src_g, src_stride_g, src_b, - src_stride_b, dst_argb, dst_stride_argb, width, - height, depth); - } else { - MergeARGB16To8PlaneAlpha(src_r, src_stride_r, src_g, src_stride_g, src_b, - src_stride_b, src_a, src_stride_a, dst_argb, - dst_stride_argb, width, height, depth); - } -} - -// Convert YUY2 to I422. -LIBYUV_API -int YUY2ToI422(const uint8_t* src_yuy2, - int src_stride_yuy2, - uint8_t* dst_y, - int dst_stride_y, - uint8_t* dst_u, - int dst_stride_u, - uint8_t* dst_v, - int dst_stride_v, - int width, - int height) { - int y; - void (*YUY2ToUV422Row)(const uint8_t* src_yuy2, uint8_t* dst_u, - uint8_t* dst_v, int width) = YUY2ToUV422Row_C; - void (*YUY2ToYRow)(const uint8_t* src_yuy2, uint8_t* dst_y, int width) = - YUY2ToYRow_C; - if (!src_yuy2 || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) { - return -1; - } - // Negative height means invert the image. - if (height < 0) { - height = -height; - src_yuy2 = src_yuy2 + (height - 1) * src_stride_yuy2; - src_stride_yuy2 = -src_stride_yuy2; - } - // Coalesce rows. 
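
Aside on the dispatch boilerplate repeated in each deleted function: the row-function pointer starts at the portable _C kernel, TestCpuFlag upgrades it to the widest SIMD variant the CPU reports, the _Any_ wrapper copes with widths that are not a multiple of the kernel's vector width, and the unsuffixed kernel is only chosen when IS_ALIGNED(width, N) holds. The sketch below is schematic; CopyRow_Scalar, CopyRow_Vector16 and CopyRow_Any are stand-ins, not libyuv kernels, and the real code tests TestCpuFlag(kCpuHasAVX2) and friends instead of a bool parameter.

// Schematic of the per-function kernel selection used throughout this file.
#include <stdint.h>

static void CopyRow_Scalar(const uint8_t* src, uint8_t* dst, int width) {
  for (int x = 0; x < width; ++x) dst[x] = src[x];
}
// Stand-in for a SIMD kernel that requires width to be a multiple of 16.
static void CopyRow_Vector16(const uint8_t* src, uint8_t* dst, int width) {
  for (int x = 0; x < width; x += 16)
    for (int i = 0; i < 16; ++i) dst[x + i] = src[x + i];
}
// Stand-in for an _Any_ wrapper: vector body plus a scalar tail.
static void CopyRow_Any(const uint8_t* src, uint8_t* dst, int width) {
  int vec = width & ~15;
  CopyRow_Vector16(src, dst, vec);
  CopyRow_Scalar(src + vec, dst + vec, width - vec);
}

void CopyPlaneSketch(const uint8_t* src, int src_stride,
                     uint8_t* dst, int dst_stride,
                     int width, int height, bool cpu_has_vector_unit) {
  void (*CopyRow)(const uint8_t*, uint8_t*, int) = CopyRow_Scalar;
  if (cpu_has_vector_unit) {          // real code: TestCpuFlag(kCpuHasAVX2)
    CopyRow = CopyRow_Any;            // handles any width
    if ((width & 15) == 0) CopyRow = CopyRow_Vector16;  // aligned fast path
  }
  for (int y = 0; y < height; ++y) {
    CopyRow(src, dst, width);
    src += src_stride;
    dst += dst_stride;
  }
}
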
- if (src_stride_yuy2 == width * 2 && dst_stride_y == width && - dst_stride_u * 2 == width && dst_stride_v * 2 == width && - width * height <= 32768) { - width *= height; - height = 1; - src_stride_yuy2 = dst_stride_y = dst_stride_u = dst_stride_v = 0; - } -#if defined(HAS_YUY2TOYROW_SSE2) - if (TestCpuFlag(kCpuHasSSE2)) { - YUY2ToUV422Row = YUY2ToUV422Row_Any_SSE2; - YUY2ToYRow = YUY2ToYRow_Any_SSE2; - if (IS_ALIGNED(width, 16)) { - YUY2ToUV422Row = YUY2ToUV422Row_SSE2; - YUY2ToYRow = YUY2ToYRow_SSE2; - } - } -#endif -#if defined(HAS_YUY2TOYROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - YUY2ToUV422Row = YUY2ToUV422Row_Any_AVX2; - YUY2ToYRow = YUY2ToYRow_Any_AVX2; - if (IS_ALIGNED(width, 32)) { - YUY2ToUV422Row = YUY2ToUV422Row_AVX2; - YUY2ToYRow = YUY2ToYRow_AVX2; - } - } -#endif -#if defined(HAS_YUY2TOYROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - YUY2ToYRow = YUY2ToYRow_Any_NEON; - YUY2ToUV422Row = YUY2ToUV422Row_Any_NEON; - if (IS_ALIGNED(width, 16)) { - YUY2ToYRow = YUY2ToYRow_NEON; - YUY2ToUV422Row = YUY2ToUV422Row_NEON; - } - } -#endif -#if defined(HAS_YUY2TOYROW_MMI) && defined(HAS_YUY2TOUV422ROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - YUY2ToYRow = YUY2ToYRow_Any_MMI; - YUY2ToUV422Row = YUY2ToUV422Row_Any_MMI; - if (IS_ALIGNED(width, 8)) { - YUY2ToYRow = YUY2ToYRow_MMI; - YUY2ToUV422Row = YUY2ToUV422Row_MMI; - } - } -#endif -#if defined(HAS_YUY2TOYROW_MSA) && defined(HAS_YUY2TOUV422ROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - YUY2ToYRow = YUY2ToYRow_Any_MSA; - YUY2ToUV422Row = YUY2ToUV422Row_Any_MSA; - if (IS_ALIGNED(width, 32)) { - YUY2ToYRow = YUY2ToYRow_MSA; - YUY2ToUV422Row = YUY2ToUV422Row_MSA; - } - } -#endif - - for (y = 0; y < height; ++y) { - YUY2ToUV422Row(src_yuy2, dst_u, dst_v, width); - YUY2ToYRow(src_yuy2, dst_y, width); - src_yuy2 += src_stride_yuy2; - dst_y += dst_stride_y; - dst_u += dst_stride_u; - dst_v += dst_stride_v; - } - return 0; -} - -// Convert UYVY to I422. -LIBYUV_API -int UYVYToI422(const uint8_t* src_uyvy, - int src_stride_uyvy, - uint8_t* dst_y, - int dst_stride_y, - uint8_t* dst_u, - int dst_stride_u, - uint8_t* dst_v, - int dst_stride_v, - int width, - int height) { - int y; - void (*UYVYToUV422Row)(const uint8_t* src_uyvy, uint8_t* dst_u, - uint8_t* dst_v, int width) = UYVYToUV422Row_C; - void (*UYVYToYRow)(const uint8_t* src_uyvy, uint8_t* dst_y, int width) = - UYVYToYRow_C; - if (!src_uyvy || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) { - return -1; - } - // Negative height means invert the image. - if (height < 0) { - height = -height; - src_uyvy = src_uyvy + (height - 1) * src_stride_uyvy; - src_stride_uyvy = -src_stride_uyvy; - } - // Coalesce rows. 
- if (src_stride_uyvy == width * 2 && dst_stride_y == width && - dst_stride_u * 2 == width && dst_stride_v * 2 == width && - width * height <= 32768) { - width *= height; - height = 1; - src_stride_uyvy = dst_stride_y = dst_stride_u = dst_stride_v = 0; - } -#if defined(HAS_UYVYTOYROW_SSE2) - if (TestCpuFlag(kCpuHasSSE2)) { - UYVYToUV422Row = UYVYToUV422Row_Any_SSE2; - UYVYToYRow = UYVYToYRow_Any_SSE2; - if (IS_ALIGNED(width, 16)) { - UYVYToUV422Row = UYVYToUV422Row_SSE2; - UYVYToYRow = UYVYToYRow_SSE2; - } - } -#endif -#if defined(HAS_UYVYTOYROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - UYVYToUV422Row = UYVYToUV422Row_Any_AVX2; - UYVYToYRow = UYVYToYRow_Any_AVX2; - if (IS_ALIGNED(width, 32)) { - UYVYToUV422Row = UYVYToUV422Row_AVX2; - UYVYToYRow = UYVYToYRow_AVX2; - } - } -#endif -#if defined(HAS_UYVYTOYROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - UYVYToYRow = UYVYToYRow_Any_NEON; - UYVYToUV422Row = UYVYToUV422Row_Any_NEON; - if (IS_ALIGNED(width, 16)) { - UYVYToYRow = UYVYToYRow_NEON; - UYVYToUV422Row = UYVYToUV422Row_NEON; - } - } -#endif -#if defined(HAS_UYVYTOYROW_MMI) && defined(HAS_UYVYTOUV422ROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - UYVYToYRow = UYVYToYRow_Any_MMI; - UYVYToUV422Row = UYVYToUV422Row_Any_MMI; - if (IS_ALIGNED(width, 16)) { - UYVYToYRow = UYVYToYRow_MMI; - UYVYToUV422Row = UYVYToUV422Row_MMI; - } - } -#endif -#if defined(HAS_UYVYTOYROW_MSA) && defined(HAS_UYVYTOUV422ROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - UYVYToYRow = UYVYToYRow_Any_MSA; - UYVYToUV422Row = UYVYToUV422Row_Any_MSA; - if (IS_ALIGNED(width, 32)) { - UYVYToYRow = UYVYToYRow_MSA; - UYVYToUV422Row = UYVYToUV422Row_MSA; - } - } -#endif - - for (y = 0; y < height; ++y) { - UYVYToUV422Row(src_uyvy, dst_u, dst_v, width); - UYVYToYRow(src_uyvy, dst_y, width); - src_uyvy += src_stride_uyvy; - dst_y += dst_stride_y; - dst_u += dst_stride_u; - dst_v += dst_stride_v; - } - return 0; -} - -// Convert YUY2 to Y. -LIBYUV_API -int YUY2ToY(const uint8_t* src_yuy2, - int src_stride_yuy2, - uint8_t* dst_y, - int dst_stride_y, - int width, - int height) { - int y; - void (*YUY2ToYRow)(const uint8_t* src_yuy2, uint8_t* dst_y, int width) = - YUY2ToYRow_C; - if (!src_yuy2 || !dst_y || width <= 0 || height == 0) { - return -1; - } - // Negative height means invert the image. - if (height < 0) { - height = -height; - src_yuy2 = src_yuy2 + (height - 1) * src_stride_yuy2; - src_stride_yuy2 = -src_stride_yuy2; - } - // Coalesce rows. 
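
Aside: YUY2ToI422 and UYVYToI422 above undo packed 4:2:2. In YUY2 each 4-byte group is Y0 U Y1 V; in UYVY it is U Y0 V Y1. Both carry two luma samples per chroma pair, so the output U and V planes are half width but full height. The plain-C sketch below unpacks one YUY2 row; it matches the effect of the YUY2ToYRow_C / YUY2ToUV422Row_C pair but is not the library's code, and it assumes an even width for simplicity.

// Sketch: unpack one YUY2 row (byte layout per 2 pixels: Y0 U Y1 V).
#include <stdint.h>

void Yuy2RowToI422Row(const uint8_t* src_yuy2, uint8_t* dst_y,
                      uint8_t* dst_u, uint8_t* dst_v, int width) {
  for (int x = 0; x < width; x += 2) {
    dst_y[x + 0] = src_yuy2[0];  // Y0
    dst_y[x + 1] = src_yuy2[2];  // Y1
    dst_u[x / 2] = src_yuy2[1];  // U shared by both pixels
    dst_v[x / 2] = src_yuy2[3];  // V shared by both pixels
    src_yuy2 += 4;
  }
}
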
- if (src_stride_yuy2 == width * 2 && dst_stride_y == width) { - width *= height; - height = 1; - src_stride_yuy2 = dst_stride_y = 0; - } -#if defined(HAS_YUY2TOYROW_SSE2) - if (TestCpuFlag(kCpuHasSSE2)) { - YUY2ToYRow = YUY2ToYRow_Any_SSE2; - if (IS_ALIGNED(width, 16)) { - YUY2ToYRow = YUY2ToYRow_SSE2; - } - } -#endif -#if defined(HAS_YUY2TOYROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - YUY2ToYRow = YUY2ToYRow_Any_AVX2; - if (IS_ALIGNED(width, 32)) { - YUY2ToYRow = YUY2ToYRow_AVX2; - } - } -#endif -#if defined(HAS_YUY2TOYROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - YUY2ToYRow = YUY2ToYRow_Any_NEON; - if (IS_ALIGNED(width, 16)) { - YUY2ToYRow = YUY2ToYRow_NEON; - } - } -#endif -#if defined(HAS_YUY2TOYROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - YUY2ToYRow = YUY2ToYRow_Any_MMI; - if (IS_ALIGNED(width, 8)) { - YUY2ToYRow = YUY2ToYRow_MMI; - } - } -#endif -#if defined(HAS_YUY2TOYROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - YUY2ToYRow = YUY2ToYRow_Any_MSA; - if (IS_ALIGNED(width, 32)) { - YUY2ToYRow = YUY2ToYRow_MSA; - } - } -#endif - - for (y = 0; y < height; ++y) { - YUY2ToYRow(src_yuy2, dst_y, width); - src_yuy2 += src_stride_yuy2; - dst_y += dst_stride_y; - } - return 0; -} - -// Mirror a plane of data. -// See Also I400Mirror -LIBYUV_API -void MirrorPlane(const uint8_t* src_y, - int src_stride_y, - uint8_t* dst_y, - int dst_stride_y, - int width, - int height) { - int y; - void (*MirrorRow)(const uint8_t* src, uint8_t* dst, int width) = MirrorRow_C; - // Negative height means invert the image. - if (height < 0) { - height = -height; - src_y = src_y + (height - 1) * src_stride_y; - src_stride_y = -src_stride_y; - } -#if defined(HAS_MIRRORROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - MirrorRow = MirrorRow_Any_NEON; - if (IS_ALIGNED(width, 32)) { - MirrorRow = MirrorRow_NEON; - } - } -#endif -#if defined(HAS_MIRRORROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - MirrorRow = MirrorRow_Any_SSSE3; - if (IS_ALIGNED(width, 16)) { - MirrorRow = MirrorRow_SSSE3; - } - } -#endif -#if defined(HAS_MIRRORROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - MirrorRow = MirrorRow_Any_AVX2; - if (IS_ALIGNED(width, 32)) { - MirrorRow = MirrorRow_AVX2; - } - } -#endif -#if defined(HAS_MIRRORROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - MirrorRow = MirrorRow_Any_MMI; - if (IS_ALIGNED(width, 8)) { - MirrorRow = MirrorRow_MMI; - } - } -#endif -#if defined(HAS_MIRRORROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - MirrorRow = MirrorRow_Any_MSA; - if (IS_ALIGNED(width, 64)) { - MirrorRow = MirrorRow_MSA; - } - } -#endif - - // Mirror plane - for (y = 0; y < height; ++y) { - MirrorRow(src_y, dst_y, width); - src_y += src_stride_y; - dst_y += dst_stride_y; - } -} - -// Mirror a plane of UV data. -LIBYUV_API -void MirrorUVPlane(const uint8_t* src_uv, - int src_stride_uv, - uint8_t* dst_uv, - int dst_stride_uv, - int width, - int height) { - int y; - void (*MirrorUVRow)(const uint8_t* src, uint8_t* dst, int width) = - MirrorUVRow_C; - // Negative height means invert the image. 
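
Aside on the "negative height means invert the image" convention that every function here follows, including MirrorUVPlane whose body continues below: a negative height requests a vertically flipped result. The function takes |height|, points the source (or destination) base at its last row, and negates the stride, so the ordinary top-to-bottom loop walks that buffer bottom-up. A small sketch of the pointer arithmetic in isolation:

// Sketch: a plain row copy that honours the negative-height flip convention.
#include <stdint.h>

void CopyPlaneFlipAware(const uint8_t* src, int src_stride,
                        uint8_t* dst, int dst_stride,
                        int width, int height) {
  if (height < 0) {                          // caller asked for a vertical flip
    height = -height;
    src = src + (height - 1) * src_stride;   // start at the last source row
    src_stride = -src_stride;                // each step now moves up one row
  }
  for (int y = 0; y < height; ++y) {
    for (int x = 0; x < width; ++x) dst[x] = src[x];
    src += src_stride;
    dst += dst_stride;
  }
}
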
- if (height < 0) { - height = -height; - src_uv = src_uv + (height - 1) * src_stride_uv; - src_stride_uv = -src_stride_uv; - } -#if defined(HAS_MIRRORUVROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - MirrorUVRow = MirrorUVRow_Any_NEON; - if (IS_ALIGNED(width, 32)) { - MirrorUVRow = MirrorUVRow_NEON; - } - } -#endif -#if defined(HAS_MIRRORUVROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - MirrorUVRow = MirrorUVRow_Any_SSSE3; - if (IS_ALIGNED(width, 8)) { - MirrorUVRow = MirrorUVRow_SSSE3; - } - } -#endif -#if defined(HAS_MIRRORUVROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - MirrorUVRow = MirrorUVRow_Any_AVX2; - if (IS_ALIGNED(width, 16)) { - MirrorUVRow = MirrorUVRow_AVX2; - } - } -#endif -#if defined(HAS_MIRRORUVROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - MirrorUVRow = MirrorUVRow_Any_MSA; - if (IS_ALIGNED(width, 8)) { - MirrorUVRow = MirrorUVRow_MSA; - } - } -#endif - - // MirrorUV plane - for (y = 0; y < height; ++y) { - MirrorUVRow(src_uv, dst_uv, width); - src_uv += src_stride_uv; - dst_uv += dst_stride_uv; - } -} - -// Mirror I400 with optional flipping -LIBYUV_API -int I400Mirror(const uint8_t* src_y, - int src_stride_y, - uint8_t* dst_y, - int dst_stride_y, - int width, - int height) { - if (!src_y || !dst_y || width <= 0 || height == 0) { - return -1; - } - // Negative height means invert the image. - if (height < 0) { - height = -height; - src_y = src_y + (height - 1) * src_stride_y; - src_stride_y = -src_stride_y; - } - - MirrorPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height); - return 0; -} - -// Mirror I420 with optional flipping -LIBYUV_API -int I420Mirror(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_y, - int dst_stride_y, - uint8_t* dst_u, - int dst_stride_u, - uint8_t* dst_v, - int dst_stride_v, - int width, - int height) { - int halfwidth = (width + 1) >> 1; - int halfheight = (height + 1) >> 1; - if (!src_y || !src_u || !src_v || !dst_u || !dst_v || width <= 0 || - height == 0) { - return -1; - } - // Negative height means invert the image. - if (height < 0) { - height = -height; - halfheight = (height + 1) >> 1; - src_y = src_y + (height - 1) * src_stride_y; - src_u = src_u + (halfheight - 1) * src_stride_u; - src_v = src_v + (halfheight - 1) * src_stride_v; - src_stride_y = -src_stride_y; - src_stride_u = -src_stride_u; - src_stride_v = -src_stride_v; - } - - if (dst_y) { - MirrorPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height); - } - MirrorPlane(src_u, src_stride_u, dst_u, dst_stride_u, halfwidth, halfheight); - MirrorPlane(src_v, src_stride_v, dst_v, dst_stride_v, halfwidth, halfheight); - return 0; -} - -// NV12 mirror. -LIBYUV_API -int NV12Mirror(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_uv, - int src_stride_uv, - uint8_t* dst_y, - int dst_stride_y, - uint8_t* dst_uv, - int dst_stride_uv, - int width, - int height) { - int halfwidth = (width + 1) >> 1; - int halfheight = (height + 1) >> 1; - if (!src_y || !src_uv || !dst_uv || width <= 0 || height == 0) { - return -1; - } - // Negative height means invert the image. 
- if (height < 0) { - height = -height; - halfheight = (height + 1) >> 1; - src_y = src_y + (height - 1) * src_stride_y; - src_uv = src_uv + (halfheight - 1) * src_stride_uv; - src_stride_y = -src_stride_y; - src_stride_uv = -src_stride_uv; - } - - if (dst_y) { - MirrorPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height); - } - MirrorUVPlane(src_uv, src_stride_uv, dst_uv, dst_stride_uv, halfwidth, - halfheight); - return 0; -} - -// ARGB mirror. -LIBYUV_API -int ARGBMirror(const uint8_t* src_argb, - int src_stride_argb, - uint8_t* dst_argb, - int dst_stride_argb, - int width, - int height) { - int y; - void (*ARGBMirrorRow)(const uint8_t* src, uint8_t* dst, int width) = - ARGBMirrorRow_C; - if (!src_argb || !dst_argb || width <= 0 || height == 0) { - return -1; - } - // Negative height means invert the image. - if (height < 0) { - height = -height; - src_argb = src_argb + (height - 1) * src_stride_argb; - src_stride_argb = -src_stride_argb; - } -#if defined(HAS_ARGBMIRRORROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - ARGBMirrorRow = ARGBMirrorRow_Any_NEON; - if (IS_ALIGNED(width, 8)) { - ARGBMirrorRow = ARGBMirrorRow_NEON; - } - } -#endif -#if defined(HAS_ARGBMIRRORROW_SSE2) - if (TestCpuFlag(kCpuHasSSE2)) { - ARGBMirrorRow = ARGBMirrorRow_Any_SSE2; - if (IS_ALIGNED(width, 4)) { - ARGBMirrorRow = ARGBMirrorRow_SSE2; - } - } -#endif -#if defined(HAS_ARGBMIRRORROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - ARGBMirrorRow = ARGBMirrorRow_Any_AVX2; - if (IS_ALIGNED(width, 8)) { - ARGBMirrorRow = ARGBMirrorRow_AVX2; - } - } -#endif -#if defined(HAS_ARGBMIRRORROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - ARGBMirrorRow = ARGBMirrorRow_Any_MMI; - if (IS_ALIGNED(width, 2)) { - ARGBMirrorRow = ARGBMirrorRow_MMI; - } - } -#endif -#if defined(HAS_ARGBMIRRORROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - ARGBMirrorRow = ARGBMirrorRow_Any_MSA; - if (IS_ALIGNED(width, 16)) { - ARGBMirrorRow = ARGBMirrorRow_MSA; - } - } -#endif - - // Mirror plane - for (y = 0; y < height; ++y) { - ARGBMirrorRow(src_argb, dst_argb, width); - src_argb += src_stride_argb; - dst_argb += dst_stride_argb; - } - return 0; -} - -// RGB24 mirror. -LIBYUV_API -int RGB24Mirror(const uint8_t* src_rgb24, - int src_stride_rgb24, - uint8_t* dst_rgb24, - int dst_stride_rgb24, - int width, - int height) { - int y; - void (*RGB24MirrorRow)(const uint8_t* src, uint8_t* dst, int width) = - RGB24MirrorRow_C; - if (!src_rgb24 || !dst_rgb24 || width <= 0 || height == 0) { - return -1; - } - // Negative height means invert the image. - if (height < 0) { - height = -height; - src_rgb24 = src_rgb24 + (height - 1) * src_stride_rgb24; - src_stride_rgb24 = -src_stride_rgb24; - } -#if defined(HAS_RGB24MIRRORROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - RGB24MirrorRow = RGB24MirrorRow_Any_NEON; - if (IS_ALIGNED(width, 16)) { - RGB24MirrorRow = RGB24MirrorRow_NEON; - } - } -#endif -#if defined(HAS_RGB24MIRRORROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - RGB24MirrorRow = RGB24MirrorRow_Any_SSSE3; - if (IS_ALIGNED(width, 16)) { - RGB24MirrorRow = RGB24MirrorRow_SSSE3; - } - } -#endif - - // Mirror plane - for (y = 0; y < height; ++y) { - RGB24MirrorRow(src_rgb24, dst_rgb24, width); - src_rgb24 += src_stride_rgb24; - dst_rgb24 += dst_stride_rgb24; - } - return 0; -} - -// Get a blender that optimized for the CPU and pixel count. -// As there are 6 blenders to choose from, the caller should try to use -// the same blend function for all pixels if possible. 
-LIBYUV_API -ARGBBlendRow GetARGBBlend() { - void (*ARGBBlendRow)(const uint8_t* src_argb, const uint8_t* src_argb1, - uint8_t* dst_argb, int width) = ARGBBlendRow_C; -#if defined(HAS_ARGBBLENDROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - ARGBBlendRow = ARGBBlendRow_SSSE3; - return ARGBBlendRow; - } -#endif -#if defined(HAS_ARGBBLENDROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - ARGBBlendRow = ARGBBlendRow_NEON; - } -#endif -#if defined(HAS_ARGBBLENDROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - ARGBBlendRow = ARGBBlendRow_MMI; - } -#endif -#if defined(HAS_ARGBBLENDROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - ARGBBlendRow = ARGBBlendRow_MSA; - } -#endif - return ARGBBlendRow; -} - -// Alpha Blend 2 ARGB images and store to destination. -LIBYUV_API -int ARGBBlend(const uint8_t* src_argb0, - int src_stride_argb0, - const uint8_t* src_argb1, - int src_stride_argb1, - uint8_t* dst_argb, - int dst_stride_argb, - int width, - int height) { - int y; - void (*ARGBBlendRow)(const uint8_t* src_argb, const uint8_t* src_argb1, - uint8_t* dst_argb, int width) = GetARGBBlend(); - if (!src_argb0 || !src_argb1 || !dst_argb || width <= 0 || height == 0) { - return -1; - } - // Negative height means invert the image. - if (height < 0) { - height = -height; - dst_argb = dst_argb + (height - 1) * dst_stride_argb; - dst_stride_argb = -dst_stride_argb; - } - // Coalesce rows. - if (src_stride_argb0 == width * 4 && src_stride_argb1 == width * 4 && - dst_stride_argb == width * 4) { - width *= height; - height = 1; - src_stride_argb0 = src_stride_argb1 = dst_stride_argb = 0; - } - - for (y = 0; y < height; ++y) { - ARGBBlendRow(src_argb0, src_argb1, dst_argb, width); - src_argb0 += src_stride_argb0; - src_argb1 += src_stride_argb1; - dst_argb += dst_stride_argb; - } - return 0; -} - -// Alpha Blend plane and store to destination. -LIBYUV_API -int BlendPlane(const uint8_t* src_y0, - int src_stride_y0, - const uint8_t* src_y1, - int src_stride_y1, - const uint8_t* alpha, - int alpha_stride, - uint8_t* dst_y, - int dst_stride_y, - int width, - int height) { - int y; - void (*BlendPlaneRow)(const uint8_t* src0, const uint8_t* src1, - const uint8_t* alpha, uint8_t* dst, int width) = - BlendPlaneRow_C; - if (!src_y0 || !src_y1 || !alpha || !dst_y || width <= 0 || height == 0) { - return -1; - } - // Negative height means invert the image. - if (height < 0) { - height = -height; - dst_y = dst_y + (height - 1) * dst_stride_y; - dst_stride_y = -dst_stride_y; - } - - // Coalesce rows for Y plane. 
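
Aside: a usage sketch for ARGBBlend above. The first source acts as the foreground whose alpha drives the blend, and the premultiplied formula quoted further down in this file (p = f + (1 - a) * b) suggests the foreground should be attenuated first, e.g. with ARGBAttenuate, which is also deleted later in this file. The tightly packed strides, the caller-provided scratch buffer, and the header path are assumptions of the example, not statements about the library's contract.

// Sketch: composite a foreground ARGB image over a background.
// fg_premul is caller-provided scratch of the same size as the inputs.
#include <stdint.h>
#include "libyuv/planar_functions.h"

void BlendOver(const uint8_t* fg_argb, const uint8_t* bg_argb,
               uint8_t* fg_premul, uint8_t* dst_argb, int width, int height) {
  // Premultiply the foreground by its alpha (the "attenuate" step).
  libyuv::ARGBAttenuate(fg_argb, width * 4, fg_premul, width * 4,
                        width, height);
  // First source = foreground (its alpha is used), second = background.
  libyuv::ARGBBlend(fg_premul, width * 4, bg_argb, width * 4,
                    dst_argb, width * 4, width, height);
}
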
- if (src_stride_y0 == width && src_stride_y1 == width && - alpha_stride == width && dst_stride_y == width) { - width *= height; - height = 1; - src_stride_y0 = src_stride_y1 = alpha_stride = dst_stride_y = 0; - } - -#if defined(HAS_BLENDPLANEROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - BlendPlaneRow = BlendPlaneRow_Any_SSSE3; - if (IS_ALIGNED(width, 8)) { - BlendPlaneRow = BlendPlaneRow_SSSE3; - } - } -#endif -#if defined(HAS_BLENDPLANEROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - BlendPlaneRow = BlendPlaneRow_Any_AVX2; - if (IS_ALIGNED(width, 32)) { - BlendPlaneRow = BlendPlaneRow_AVX2; - } - } -#endif -#if defined(HAS_BLENDPLANEROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - BlendPlaneRow = BlendPlaneRow_Any_MMI; - if (IS_ALIGNED(width, 8)) { - BlendPlaneRow = BlendPlaneRow_MMI; - } - } -#endif - - for (y = 0; y < height; ++y) { - BlendPlaneRow(src_y0, src_y1, alpha, dst_y, width); - src_y0 += src_stride_y0; - src_y1 += src_stride_y1; - alpha += alpha_stride; - dst_y += dst_stride_y; - } - return 0; -} - -#define MAXTWIDTH 2048 -// Alpha Blend YUV images and store to destination. -LIBYUV_API -int I420Blend(const uint8_t* src_y0, - int src_stride_y0, - const uint8_t* src_u0, - int src_stride_u0, - const uint8_t* src_v0, - int src_stride_v0, - const uint8_t* src_y1, - int src_stride_y1, - const uint8_t* src_u1, - int src_stride_u1, - const uint8_t* src_v1, - int src_stride_v1, - const uint8_t* alpha, - int alpha_stride, - uint8_t* dst_y, - int dst_stride_y, - uint8_t* dst_u, - int dst_stride_u, - uint8_t* dst_v, - int dst_stride_v, - int width, - int height) { - int y; - // Half width/height for UV. - int halfwidth = (width + 1) >> 1; - void (*BlendPlaneRow)(const uint8_t* src0, const uint8_t* src1, - const uint8_t* alpha, uint8_t* dst, int width) = - BlendPlaneRow_C; - void (*ScaleRowDown2)(const uint8_t* src_ptr, ptrdiff_t src_stride, - uint8_t* dst_ptr, int dst_width) = ScaleRowDown2Box_C; - if (!src_y0 || !src_u0 || !src_v0 || !src_y1 || !src_u1 || !src_v1 || - !alpha || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) { - return -1; - } - - // Negative height means invert the image. - if (height < 0) { - height = -height; - dst_y = dst_y + (height - 1) * dst_stride_y; - dst_stride_y = -dst_stride_y; - } - - // Blend Y plane. 
- BlendPlane(src_y0, src_stride_y0, src_y1, src_stride_y1, alpha, alpha_stride, - dst_y, dst_stride_y, width, height); - -#if defined(HAS_BLENDPLANEROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - BlendPlaneRow = BlendPlaneRow_Any_SSSE3; - if (IS_ALIGNED(halfwidth, 8)) { - BlendPlaneRow = BlendPlaneRow_SSSE3; - } - } -#endif -#if defined(HAS_BLENDPLANEROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - BlendPlaneRow = BlendPlaneRow_Any_AVX2; - if (IS_ALIGNED(halfwidth, 32)) { - BlendPlaneRow = BlendPlaneRow_AVX2; - } - } -#endif -#if defined(HAS_BLENDPLANEROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - BlendPlaneRow = BlendPlaneRow_Any_MMI; - if (IS_ALIGNED(halfwidth, 8)) { - BlendPlaneRow = BlendPlaneRow_MMI; - } - } -#endif - if (!IS_ALIGNED(width, 2)) { - ScaleRowDown2 = ScaleRowDown2Box_Odd_C; - } -#if defined(HAS_SCALEROWDOWN2_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - ScaleRowDown2 = ScaleRowDown2Box_Odd_NEON; - if (IS_ALIGNED(width, 2)) { - ScaleRowDown2 = ScaleRowDown2Box_Any_NEON; - if (IS_ALIGNED(halfwidth, 16)) { - ScaleRowDown2 = ScaleRowDown2Box_NEON; - } - } - } -#endif -#if defined(HAS_SCALEROWDOWN2_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - ScaleRowDown2 = ScaleRowDown2Box_Odd_SSSE3; - if (IS_ALIGNED(width, 2)) { - ScaleRowDown2 = ScaleRowDown2Box_Any_SSSE3; - if (IS_ALIGNED(halfwidth, 16)) { - ScaleRowDown2 = ScaleRowDown2Box_SSSE3; - } - } - } -#endif -#if defined(HAS_SCALEROWDOWN2_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - ScaleRowDown2 = ScaleRowDown2Box_Odd_AVX2; - if (IS_ALIGNED(width, 2)) { - ScaleRowDown2 = ScaleRowDown2Box_Any_AVX2; - if (IS_ALIGNED(halfwidth, 32)) { - ScaleRowDown2 = ScaleRowDown2Box_AVX2; - } - } - } -#endif -#if defined(HAS_SCALEROWDOWN2_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - ScaleRowDown2 = ScaleRowDown2Box_Odd_MMI; - if (IS_ALIGNED(width, 2)) { - ScaleRowDown2 = ScaleRowDown2Box_Any_MMI; - if (IS_ALIGNED(halfwidth, 8)) { - ScaleRowDown2 = ScaleRowDown2Box_MMI; - } - } - } -#endif - - // Row buffer for intermediate alpha pixels. - align_buffer_64(halfalpha, halfwidth); - for (y = 0; y < height; y += 2) { - // last row of odd height image use 1 row of alpha instead of 2. - if (y == (height - 1)) { - alpha_stride = 0; - } - // Subsample 2 rows of UV to half width and half height. - ScaleRowDown2(alpha, alpha_stride, halfalpha, halfwidth); - alpha += alpha_stride * 2; - BlendPlaneRow(src_u0, src_u1, halfalpha, dst_u, halfwidth); - BlendPlaneRow(src_v0, src_v1, halfalpha, dst_v, halfwidth); - src_u0 += src_stride_u0; - src_u1 += src_stride_u1; - dst_u += dst_stride_u; - src_v0 += src_stride_v0; - src_v1 += src_stride_v1; - dst_v += dst_stride_v; - } - free_aligned_buffer_64(halfalpha); - return 0; -} - -// Multiply 2 ARGB images and store to destination. -LIBYUV_API -int ARGBMultiply(const uint8_t* src_argb0, - int src_stride_argb0, - const uint8_t* src_argb1, - int src_stride_argb1, - uint8_t* dst_argb, - int dst_stride_argb, - int width, - int height) { - int y; - void (*ARGBMultiplyRow)(const uint8_t* src0, const uint8_t* src1, - uint8_t* dst, int width) = ARGBMultiplyRow_C; - if (!src_argb0 || !src_argb1 || !dst_argb || width <= 0 || height == 0) { - return -1; - } - // Negative height means invert the image. - if (height < 0) { - height = -height; - dst_argb = dst_argb + (height - 1) * dst_stride_argb; - dst_stride_argb = -dst_stride_argb; - } - // Coalesce rows. 
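
Aside on the chroma path of I420Blend above: U and V are half width and half height, so each loop iteration first reduces two rows of the full-resolution alpha plane to one half-width row (the last row of an odd-height image reuses a single alpha row, as the code notes), and that reduced row then drives BlendPlaneRow on the U and V rows. The sketch below is a plain rounded 2x2 average illustrating the reduction; the library uses its ScaleRowDown2Box kernels, whose exact rounding may differ.

// Sketch: reduce two full-resolution alpha rows to one half-width row by
// averaging each 2x2 block, as the UV pass of I420Blend needs.
#include <stdint.h>

void HalveAlphaRows(const uint8_t* alpha_row0, const uint8_t* alpha_row1,
                    uint8_t* half_alpha, int halfwidth) {
  for (int x = 0; x < halfwidth; ++x) {
    int sum = alpha_row0[2 * x] + alpha_row0[2 * x + 1] +
              alpha_row1[2 * x] + alpha_row1[2 * x + 1];
    half_alpha[x] = (uint8_t)((sum + 2) >> 2);  // rounded average of 4 samples
  }
}
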
- if (src_stride_argb0 == width * 4 && src_stride_argb1 == width * 4 && - dst_stride_argb == width * 4) { - width *= height; - height = 1; - src_stride_argb0 = src_stride_argb1 = dst_stride_argb = 0; - } -#if defined(HAS_ARGBMULTIPLYROW_SSE2) - if (TestCpuFlag(kCpuHasSSE2)) { - ARGBMultiplyRow = ARGBMultiplyRow_Any_SSE2; - if (IS_ALIGNED(width, 4)) { - ARGBMultiplyRow = ARGBMultiplyRow_SSE2; - } - } -#endif -#if defined(HAS_ARGBMULTIPLYROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - ARGBMultiplyRow = ARGBMultiplyRow_Any_AVX2; - if (IS_ALIGNED(width, 8)) { - ARGBMultiplyRow = ARGBMultiplyRow_AVX2; - } - } -#endif -#if defined(HAS_ARGBMULTIPLYROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - ARGBMultiplyRow = ARGBMultiplyRow_Any_NEON; - if (IS_ALIGNED(width, 8)) { - ARGBMultiplyRow = ARGBMultiplyRow_NEON; - } - } -#endif -#if defined(HAS_ARGBMULTIPLYROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - ARGBMultiplyRow = ARGBMultiplyRow_Any_MMI; - if (IS_ALIGNED(width, 2)) { - ARGBMultiplyRow = ARGBMultiplyRow_MMI; - } - } -#endif -#if defined(HAS_ARGBMULTIPLYROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - ARGBMultiplyRow = ARGBMultiplyRow_Any_MSA; - if (IS_ALIGNED(width, 4)) { - ARGBMultiplyRow = ARGBMultiplyRow_MSA; - } - } -#endif - - // Multiply plane - for (y = 0; y < height; ++y) { - ARGBMultiplyRow(src_argb0, src_argb1, dst_argb, width); - src_argb0 += src_stride_argb0; - src_argb1 += src_stride_argb1; - dst_argb += dst_stride_argb; - } - return 0; -} - -// Add 2 ARGB images and store to destination. -LIBYUV_API -int ARGBAdd(const uint8_t* src_argb0, - int src_stride_argb0, - const uint8_t* src_argb1, - int src_stride_argb1, - uint8_t* dst_argb, - int dst_stride_argb, - int width, - int height) { - int y; - void (*ARGBAddRow)(const uint8_t* src0, const uint8_t* src1, uint8_t* dst, - int width) = ARGBAddRow_C; - if (!src_argb0 || !src_argb1 || !dst_argb || width <= 0 || height == 0) { - return -1; - } - // Negative height means invert the image. - if (height < 0) { - height = -height; - dst_argb = dst_argb + (height - 1) * dst_stride_argb; - dst_stride_argb = -dst_stride_argb; - } - // Coalesce rows. 
- if (src_stride_argb0 == width * 4 && src_stride_argb1 == width * 4 && - dst_stride_argb == width * 4) { - width *= height; - height = 1; - src_stride_argb0 = src_stride_argb1 = dst_stride_argb = 0; - } -#if defined(HAS_ARGBADDROW_SSE2) - if (TestCpuFlag(kCpuHasSSE2)) { - ARGBAddRow = ARGBAddRow_SSE2; - } -#endif -#if defined(HAS_ARGBADDROW_SSE2) - if (TestCpuFlag(kCpuHasSSE2)) { - ARGBAddRow = ARGBAddRow_Any_SSE2; - if (IS_ALIGNED(width, 4)) { - ARGBAddRow = ARGBAddRow_SSE2; - } - } -#endif -#if defined(HAS_ARGBADDROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - ARGBAddRow = ARGBAddRow_Any_AVX2; - if (IS_ALIGNED(width, 8)) { - ARGBAddRow = ARGBAddRow_AVX2; - } - } -#endif -#if defined(HAS_ARGBADDROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - ARGBAddRow = ARGBAddRow_Any_NEON; - if (IS_ALIGNED(width, 8)) { - ARGBAddRow = ARGBAddRow_NEON; - } - } -#endif -#if defined(HAS_ARGBADDROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - ARGBAddRow = ARGBAddRow_Any_MMI; - if (IS_ALIGNED(width, 2)) { - ARGBAddRow = ARGBAddRow_MMI; - } - } -#endif -#if defined(HAS_ARGBADDROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - ARGBAddRow = ARGBAddRow_Any_MSA; - if (IS_ALIGNED(width, 8)) { - ARGBAddRow = ARGBAddRow_MSA; - } - } -#endif - - // Add plane - for (y = 0; y < height; ++y) { - ARGBAddRow(src_argb0, src_argb1, dst_argb, width); - src_argb0 += src_stride_argb0; - src_argb1 += src_stride_argb1; - dst_argb += dst_stride_argb; - } - return 0; -} - -// Subtract 2 ARGB images and store to destination. -LIBYUV_API -int ARGBSubtract(const uint8_t* src_argb0, - int src_stride_argb0, - const uint8_t* src_argb1, - int src_stride_argb1, - uint8_t* dst_argb, - int dst_stride_argb, - int width, - int height) { - int y; - void (*ARGBSubtractRow)(const uint8_t* src0, const uint8_t* src1, - uint8_t* dst, int width) = ARGBSubtractRow_C; - if (!src_argb0 || !src_argb1 || !dst_argb || width <= 0 || height == 0) { - return -1; - } - // Negative height means invert the image. - if (height < 0) { - height = -height; - dst_argb = dst_argb + (height - 1) * dst_stride_argb; - dst_stride_argb = -dst_stride_argb; - } - // Coalesce rows. 
- if (src_stride_argb0 == width * 4 && src_stride_argb1 == width * 4 && - dst_stride_argb == width * 4) { - width *= height; - height = 1; - src_stride_argb0 = src_stride_argb1 = dst_stride_argb = 0; - } -#if defined(HAS_ARGBSUBTRACTROW_SSE2) - if (TestCpuFlag(kCpuHasSSE2)) { - ARGBSubtractRow = ARGBSubtractRow_Any_SSE2; - if (IS_ALIGNED(width, 4)) { - ARGBSubtractRow = ARGBSubtractRow_SSE2; - } - } -#endif -#if defined(HAS_ARGBSUBTRACTROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - ARGBSubtractRow = ARGBSubtractRow_Any_AVX2; - if (IS_ALIGNED(width, 8)) { - ARGBSubtractRow = ARGBSubtractRow_AVX2; - } - } -#endif -#if defined(HAS_ARGBSUBTRACTROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - ARGBSubtractRow = ARGBSubtractRow_Any_NEON; - if (IS_ALIGNED(width, 8)) { - ARGBSubtractRow = ARGBSubtractRow_NEON; - } - } -#endif -#if defined(HAS_ARGBSUBTRACTROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - ARGBSubtractRow = ARGBSubtractRow_Any_MMI; - if (IS_ALIGNED(width, 2)) { - ARGBSubtractRow = ARGBSubtractRow_MMI; - } - } -#endif -#if defined(HAS_ARGBSUBTRACTROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - ARGBSubtractRow = ARGBSubtractRow_Any_MSA; - if (IS_ALIGNED(width, 8)) { - ARGBSubtractRow = ARGBSubtractRow_MSA; - } - } -#endif - - // Subtract plane - for (y = 0; y < height; ++y) { - ARGBSubtractRow(src_argb0, src_argb1, dst_argb, width); - src_argb0 += src_stride_argb0; - src_argb1 += src_stride_argb1; - dst_argb += dst_stride_argb; - } - return 0; -} - -// Convert RAW to RGB24. -LIBYUV_API -int RAWToRGB24(const uint8_t* src_raw, - int src_stride_raw, - uint8_t* dst_rgb24, - int dst_stride_rgb24, - int width, - int height) { - int y; - void (*RAWToRGB24Row)(const uint8_t* src_rgb, uint8_t* dst_rgb24, int width) = - RAWToRGB24Row_C; - if (!src_raw || !dst_rgb24 || width <= 0 || height == 0) { - return -1; - } - // Negative height means invert the image. - if (height < 0) { - height = -height; - src_raw = src_raw + (height - 1) * src_stride_raw; - src_stride_raw = -src_stride_raw; - } - // Coalesce rows. - if (src_stride_raw == width * 3 && dst_stride_rgb24 == width * 3) { - width *= height; - height = 1; - src_stride_raw = dst_stride_rgb24 = 0; - } -#if defined(HAS_RAWTORGB24ROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - RAWToRGB24Row = RAWToRGB24Row_Any_SSSE3; - if (IS_ALIGNED(width, 8)) { - RAWToRGB24Row = RAWToRGB24Row_SSSE3; - } - } -#endif -#if defined(HAS_RAWTORGB24ROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - RAWToRGB24Row = RAWToRGB24Row_Any_NEON; - if (IS_ALIGNED(width, 8)) { - RAWToRGB24Row = RAWToRGB24Row_NEON; - } - } -#endif -#if defined(HAS_RAWTORGB24ROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - RAWToRGB24Row = RAWToRGB24Row_Any_MMI; - if (IS_ALIGNED(width, 4)) { - RAWToRGB24Row = RAWToRGB24Row_MMI; - } - } -#endif -#if defined(HAS_RAWTORGB24ROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - RAWToRGB24Row = RAWToRGB24Row_Any_MSA; - if (IS_ALIGNED(width, 16)) { - RAWToRGB24Row = RAWToRGB24Row_MSA; - } - } -#endif - - for (y = 0; y < height; ++y) { - RAWToRGB24Row(src_raw, dst_rgb24, width); - src_raw += src_stride_raw; - dst_rgb24 += dst_stride_rgb24; - } - return 0; -} - -LIBYUV_API -void SetPlane(uint8_t* dst_y, - int dst_stride_y, - int width, - int height, - uint32_t value) { - int y; - void (*SetRow)(uint8_t * dst, uint8_t value, int width) = SetRow_C; - if (height < 0) { - height = -height; - dst_y = dst_y + (height - 1) * dst_stride_y; - dst_stride_y = -dst_stride_y; - } - // Coalesce rows. 
- if (dst_stride_y == width) { - width *= height; - height = 1; - dst_stride_y = 0; - } -#if defined(HAS_SETROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - SetRow = SetRow_Any_NEON; - if (IS_ALIGNED(width, 16)) { - SetRow = SetRow_NEON; - } - } -#endif -#if defined(HAS_SETROW_X86) - if (TestCpuFlag(kCpuHasX86)) { - SetRow = SetRow_Any_X86; - if (IS_ALIGNED(width, 4)) { - SetRow = SetRow_X86; - } - } -#endif -#if defined(HAS_SETROW_ERMS) - if (TestCpuFlag(kCpuHasERMS)) { - SetRow = SetRow_ERMS; - } -#endif -#if defined(HAS_SETROW_MSA) - if (TestCpuFlag(kCpuHasMSA) && IS_ALIGNED(width, 16)) { - SetRow = SetRow_MSA; - } -#endif - - // Set plane - for (y = 0; y < height; ++y) { - SetRow(dst_y, value, width); - dst_y += dst_stride_y; - } -} - -// Draw a rectangle into I420 -LIBYUV_API -int I420Rect(uint8_t* dst_y, - int dst_stride_y, - uint8_t* dst_u, - int dst_stride_u, - uint8_t* dst_v, - int dst_stride_v, - int x, - int y, - int width, - int height, - int value_y, - int value_u, - int value_v) { - int halfwidth = (width + 1) >> 1; - int halfheight = (height + 1) >> 1; - uint8_t* start_y = dst_y + y * dst_stride_y + x; - uint8_t* start_u = dst_u + (y / 2) * dst_stride_u + (x / 2); - uint8_t* start_v = dst_v + (y / 2) * dst_stride_v + (x / 2); - if (!dst_y || !dst_u || !dst_v || width <= 0 || height == 0 || x < 0 || - y < 0 || value_y < 0 || value_y > 255 || value_u < 0 || value_u > 255 || - value_v < 0 || value_v > 255) { - return -1; - } - - SetPlane(start_y, dst_stride_y, width, height, value_y); - SetPlane(start_u, dst_stride_u, halfwidth, halfheight, value_u); - SetPlane(start_v, dst_stride_v, halfwidth, halfheight, value_v); - return 0; -} - -// Draw a rectangle into ARGB -LIBYUV_API -int ARGBRect(uint8_t* dst_argb, - int dst_stride_argb, - int dst_x, - int dst_y, - int width, - int height, - uint32_t value) { - int y; - void (*ARGBSetRow)(uint8_t * dst_argb, uint32_t value, int width) = - ARGBSetRow_C; - if (!dst_argb || width <= 0 || height == 0 || dst_x < 0 || dst_y < 0) { - return -1; - } - if (height < 0) { - height = -height; - dst_argb = dst_argb + (height - 1) * dst_stride_argb; - dst_stride_argb = -dst_stride_argb; - } - dst_argb += dst_y * dst_stride_argb + dst_x * 4; - // Coalesce rows. - if (dst_stride_argb == width * 4) { - width *= height; - height = 1; - dst_stride_argb = 0; - } - -#if defined(HAS_ARGBSETROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - ARGBSetRow = ARGBSetRow_Any_NEON; - if (IS_ALIGNED(width, 4)) { - ARGBSetRow = ARGBSetRow_NEON; - } - } -#endif -#if defined(HAS_ARGBSETROW_X86) - if (TestCpuFlag(kCpuHasX86)) { - ARGBSetRow = ARGBSetRow_X86; - } -#endif -#if defined(HAS_ARGBSETROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - ARGBSetRow = ARGBSetRow_Any_MMI; - if (IS_ALIGNED(width, 4)) { - ARGBSetRow = ARGBSetRow_MMI; - } - } -#endif -#if defined(HAS_ARGBSETROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - ARGBSetRow = ARGBSetRow_Any_MSA; - if (IS_ALIGNED(width, 4)) { - ARGBSetRow = ARGBSetRow_MSA; - } - } -#endif - - // Set plane - for (y = 0; y < height; ++y) { - ARGBSetRow(dst_argb, value, width); - dst_argb += dst_stride_argb; - } - return 0; -} - -// Convert unattentuated ARGB to preattenuated ARGB. 
-// An unattenutated ARGB alpha blend uses the formula -// p = a * f + (1 - a) * b -// where -// p is output pixel -// f is foreground pixel -// b is background pixel -// a is alpha value from foreground pixel -// An preattenutated ARGB alpha blend uses the formula -// p = f + (1 - a) * b -// where -// f is foreground pixel premultiplied by alpha - -LIBYUV_API -int ARGBAttenuate(const uint8_t* src_argb, - int src_stride_argb, - uint8_t* dst_argb, - int dst_stride_argb, - int width, - int height) { - int y; - void (*ARGBAttenuateRow)(const uint8_t* src_argb, uint8_t* dst_argb, - int width) = ARGBAttenuateRow_C; - if (!src_argb || !dst_argb || width <= 0 || height == 0) { - return -1; - } - if (height < 0) { - height = -height; - src_argb = src_argb + (height - 1) * src_stride_argb; - src_stride_argb = -src_stride_argb; - } - // Coalesce rows. - if (src_stride_argb == width * 4 && dst_stride_argb == width * 4) { - width *= height; - height = 1; - src_stride_argb = dst_stride_argb = 0; - } -#if defined(HAS_ARGBATTENUATEROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - ARGBAttenuateRow = ARGBAttenuateRow_Any_SSSE3; - if (IS_ALIGNED(width, 4)) { - ARGBAttenuateRow = ARGBAttenuateRow_SSSE3; - } - } -#endif -#if defined(HAS_ARGBATTENUATEROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - ARGBAttenuateRow = ARGBAttenuateRow_Any_AVX2; - if (IS_ALIGNED(width, 8)) { - ARGBAttenuateRow = ARGBAttenuateRow_AVX2; - } - } -#endif -#if defined(HAS_ARGBATTENUATEROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - ARGBAttenuateRow = ARGBAttenuateRow_Any_NEON; - if (IS_ALIGNED(width, 8)) { - ARGBAttenuateRow = ARGBAttenuateRow_NEON; - } - } -#endif -#if defined(HAS_ARGBATTENUATEROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - ARGBAttenuateRow = ARGBAttenuateRow_Any_MMI; - if (IS_ALIGNED(width, 2)) { - ARGBAttenuateRow = ARGBAttenuateRow_MMI; - } - } -#endif -#if defined(HAS_ARGBATTENUATEROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - ARGBAttenuateRow = ARGBAttenuateRow_Any_MSA; - if (IS_ALIGNED(width, 8)) { - ARGBAttenuateRow = ARGBAttenuateRow_MSA; - } - } -#endif - - for (y = 0; y < height; ++y) { - ARGBAttenuateRow(src_argb, dst_argb, width); - src_argb += src_stride_argb; - dst_argb += dst_stride_argb; - } - return 0; -} - -// Convert preattentuated ARGB to unattenuated ARGB. -LIBYUV_API -int ARGBUnattenuate(const uint8_t* src_argb, - int src_stride_argb, - uint8_t* dst_argb, - int dst_stride_argb, - int width, - int height) { - int y; - void (*ARGBUnattenuateRow)(const uint8_t* src_argb, uint8_t* dst_argb, - int width) = ARGBUnattenuateRow_C; - if (!src_argb || !dst_argb || width <= 0 || height == 0) { - return -1; - } - if (height < 0) { - height = -height; - src_argb = src_argb + (height - 1) * src_stride_argb; - src_stride_argb = -src_stride_argb; - } - // Coalesce rows. - if (src_stride_argb == width * 4 && dst_stride_argb == width * 4) { - width *= height; - height = 1; - src_stride_argb = dst_stride_argb = 0; - } -#if defined(HAS_ARGBUNATTENUATEROW_SSE2) - if (TestCpuFlag(kCpuHasSSE2)) { - ARGBUnattenuateRow = ARGBUnattenuateRow_Any_SSE2; - if (IS_ALIGNED(width, 4)) { - ARGBUnattenuateRow = ARGBUnattenuateRow_SSE2; - } - } -#endif -#if defined(HAS_ARGBUNATTENUATEROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - ARGBUnattenuateRow = ARGBUnattenuateRow_Any_AVX2; - if (IS_ALIGNED(width, 8)) { - ARGBUnattenuateRow = ARGBUnattenuateRow_AVX2; - } - } -#endif - // TODO(fbarchard): Neon version. 
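
Aside: a worked pixel for the two formulas quoted above. With alpha normalized to a = A / 255, a straight (unattenuated) blend computes p = a*f + (1 - a)*b per channel, while a premultiplied source stores f' = a*f so the blend reduces to p = f' + (1 - a)*b. The example below uses exact /255 integer arithmetic for illustration; the SIMD rows above use fixed-point approximations of the same thing, so results can differ by a unit or so.

// Worked example: premultiplying a channel and blending either way gives
// the same result up to rounding. Plain arithmetic, not libyuv row code.
#include <stdio.h>

int main(void) {
  int f = 200, b = 40, a = 128;                 // foreground, background, alpha
  int straight = (f * a + b * (255 - a)) / 255;           // p = a*f + (1-a)*b
  int f_premul = f * a / 255;                             // the attenuate step
  int premul = f_premul + b * (255 - a) / 255;            // p = f' + (1-a)*b
  printf("straight=%d premultiplied=%d\n", straight, premul);  // 120 vs 119
  return 0;
}
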
- - for (y = 0; y < height; ++y) { - ARGBUnattenuateRow(src_argb, dst_argb, width); - src_argb += src_stride_argb; - dst_argb += dst_stride_argb; - } - return 0; -} - -// Convert ARGB to Grayed ARGB. -LIBYUV_API -int ARGBGrayTo(const uint8_t* src_argb, - int src_stride_argb, - uint8_t* dst_argb, - int dst_stride_argb, - int width, - int height) { - int y; - void (*ARGBGrayRow)(const uint8_t* src_argb, uint8_t* dst_argb, int width) = - ARGBGrayRow_C; - if (!src_argb || !dst_argb || width <= 0 || height == 0) { - return -1; - } - if (height < 0) { - height = -height; - src_argb = src_argb + (height - 1) * src_stride_argb; - src_stride_argb = -src_stride_argb; - } - // Coalesce rows. - if (src_stride_argb == width * 4 && dst_stride_argb == width * 4) { - width *= height; - height = 1; - src_stride_argb = dst_stride_argb = 0; - } -#if defined(HAS_ARGBGRAYROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 8)) { - ARGBGrayRow = ARGBGrayRow_SSSE3; - } -#endif -#if defined(HAS_ARGBGRAYROW_NEON) - if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 8)) { - ARGBGrayRow = ARGBGrayRow_NEON; - } -#endif -#if defined(HAS_ARGBGRAYROW_MMI) - if (TestCpuFlag(kCpuHasMMI) && IS_ALIGNED(width, 2)) { - ARGBGrayRow = ARGBGrayRow_MMI; - } -#endif -#if defined(HAS_ARGBGRAYROW_MSA) - if (TestCpuFlag(kCpuHasMSA) && IS_ALIGNED(width, 8)) { - ARGBGrayRow = ARGBGrayRow_MSA; - } -#endif - - for (y = 0; y < height; ++y) { - ARGBGrayRow(src_argb, dst_argb, width); - src_argb += src_stride_argb; - dst_argb += dst_stride_argb; - } - return 0; -} - -// Make a rectangle of ARGB gray scale. -LIBYUV_API -int ARGBGray(uint8_t* dst_argb, - int dst_stride_argb, - int dst_x, - int dst_y, - int width, - int height) { - int y; - void (*ARGBGrayRow)(const uint8_t* src_argb, uint8_t* dst_argb, int width) = - ARGBGrayRow_C; - uint8_t* dst = dst_argb + dst_y * dst_stride_argb + dst_x * 4; - if (!dst_argb || width <= 0 || height <= 0 || dst_x < 0 || dst_y < 0) { - return -1; - } - // Coalesce rows. - if (dst_stride_argb == width * 4) { - width *= height; - height = 1; - dst_stride_argb = 0; - } -#if defined(HAS_ARGBGRAYROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 8)) { - ARGBGrayRow = ARGBGrayRow_SSSE3; - } -#endif -#if defined(HAS_ARGBGRAYROW_NEON) - if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 8)) { - ARGBGrayRow = ARGBGrayRow_NEON; - } -#endif -#if defined(HAS_ARGBGRAYROW_MMI) - if (TestCpuFlag(kCpuHasMMI) && IS_ALIGNED(width, 2)) { - ARGBGrayRow = ARGBGrayRow_MMI; - } -#endif -#if defined(HAS_ARGBGRAYROW_MSA) - if (TestCpuFlag(kCpuHasMSA) && IS_ALIGNED(width, 8)) { - ARGBGrayRow = ARGBGrayRow_MSA; - } -#endif - - for (y = 0; y < height; ++y) { - ARGBGrayRow(dst, dst, width); - dst += dst_stride_argb; - } - return 0; -} - -// Make a rectangle of ARGB Sepia tone. -LIBYUV_API -int ARGBSepia(uint8_t* dst_argb, - int dst_stride_argb, - int dst_x, - int dst_y, - int width, - int height) { - int y; - void (*ARGBSepiaRow)(uint8_t * dst_argb, int width) = ARGBSepiaRow_C; - uint8_t* dst = dst_argb + dst_y * dst_stride_argb + dst_x * 4; - if (!dst_argb || width <= 0 || height <= 0 || dst_x < 0 || dst_y < 0) { - return -1; - } - // Coalesce rows. 
- if (dst_stride_argb == width * 4) { - width *= height; - height = 1; - dst_stride_argb = 0; - } -#if defined(HAS_ARGBSEPIAROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 8)) { - ARGBSepiaRow = ARGBSepiaRow_SSSE3; - } -#endif -#if defined(HAS_ARGBSEPIAROW_NEON) - if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 8)) { - ARGBSepiaRow = ARGBSepiaRow_NEON; - } -#endif -#if defined(HAS_ARGBSEPIAROW_MMI) - if (TestCpuFlag(kCpuHasMMI) && IS_ALIGNED(width, 2)) { - ARGBSepiaRow = ARGBSepiaRow_MMI; - } -#endif -#if defined(HAS_ARGBSEPIAROW_MSA) - if (TestCpuFlag(kCpuHasMSA) && IS_ALIGNED(width, 8)) { - ARGBSepiaRow = ARGBSepiaRow_MSA; - } -#endif - - for (y = 0; y < height; ++y) { - ARGBSepiaRow(dst, width); - dst += dst_stride_argb; - } - return 0; -} - -// Apply a 4x4 matrix to each ARGB pixel. -// Note: Normally for shading, but can be used to swizzle or invert. -LIBYUV_API -int ARGBColorMatrix(const uint8_t* src_argb, - int src_stride_argb, - uint8_t* dst_argb, - int dst_stride_argb, - const int8_t* matrix_argb, - int width, - int height) { - int y; - void (*ARGBColorMatrixRow)(const uint8_t* src_argb, uint8_t* dst_argb, - const int8_t* matrix_argb, int width) = - ARGBColorMatrixRow_C; - if (!src_argb || !dst_argb || !matrix_argb || width <= 0 || height == 0) { - return -1; - } - if (height < 0) { - height = -height; - src_argb = src_argb + (height - 1) * src_stride_argb; - src_stride_argb = -src_stride_argb; - } - // Coalesce rows. - if (src_stride_argb == width * 4 && dst_stride_argb == width * 4) { - width *= height; - height = 1; - src_stride_argb = dst_stride_argb = 0; - } -#if defined(HAS_ARGBCOLORMATRIXROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 8)) { - ARGBColorMatrixRow = ARGBColorMatrixRow_SSSE3; - } -#endif -#if defined(HAS_ARGBCOLORMATRIXROW_NEON) - if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 8)) { - ARGBColorMatrixRow = ARGBColorMatrixRow_NEON; - } -#endif -#if defined(HAS_ARGBCOLORMATRIXROW_MMI) - if (TestCpuFlag(kCpuHasMMI) && IS_ALIGNED(width, 2)) { - ARGBColorMatrixRow = ARGBColorMatrixRow_MMI; - } -#endif -#if defined(HAS_ARGBCOLORMATRIXROW_MSA) - if (TestCpuFlag(kCpuHasMSA) && IS_ALIGNED(width, 8)) { - ARGBColorMatrixRow = ARGBColorMatrixRow_MSA; - } -#endif - for (y = 0; y < height; ++y) { - ARGBColorMatrixRow(src_argb, dst_argb, matrix_argb, width); - src_argb += src_stride_argb; - dst_argb += dst_stride_argb; - } - return 0; -} - -// Apply a 4x3 matrix to each ARGB pixel. -// Deprecated. -LIBYUV_API -int RGBColorMatrix(uint8_t* dst_argb, - int dst_stride_argb, - const int8_t* matrix_rgb, - int dst_x, - int dst_y, - int width, - int height) { - SIMD_ALIGNED(int8_t matrix_argb[16]); - uint8_t* dst = dst_argb + dst_y * dst_stride_argb + dst_x * 4; - if (!dst_argb || !matrix_rgb || width <= 0 || height <= 0 || dst_x < 0 || - dst_y < 0) { - return -1; - } - - // Convert 4x3 7 bit matrix to 4x4 6 bit matrix. 
- matrix_argb[0] = matrix_rgb[0] / 2; - matrix_argb[1] = matrix_rgb[1] / 2; - matrix_argb[2] = matrix_rgb[2] / 2; - matrix_argb[3] = matrix_rgb[3] / 2; - matrix_argb[4] = matrix_rgb[4] / 2; - matrix_argb[5] = matrix_rgb[5] / 2; - matrix_argb[6] = matrix_rgb[6] / 2; - matrix_argb[7] = matrix_rgb[7] / 2; - matrix_argb[8] = matrix_rgb[8] / 2; - matrix_argb[9] = matrix_rgb[9] / 2; - matrix_argb[10] = matrix_rgb[10] / 2; - matrix_argb[11] = matrix_rgb[11] / 2; - matrix_argb[14] = matrix_argb[13] = matrix_argb[12] = 0; - matrix_argb[15] = 64; // 1.0 - - return ARGBColorMatrix((const uint8_t*)(dst), dst_stride_argb, dst, - dst_stride_argb, &matrix_argb[0], width, height); -} - -// Apply a color table each ARGB pixel. -// Table contains 256 ARGB values. -LIBYUV_API -int ARGBColorTable(uint8_t* dst_argb, - int dst_stride_argb, - const uint8_t* table_argb, - int dst_x, - int dst_y, - int width, - int height) { - int y; - void (*ARGBColorTableRow)(uint8_t * dst_argb, const uint8_t* table_argb, - int width) = ARGBColorTableRow_C; - uint8_t* dst = dst_argb + dst_y * dst_stride_argb + dst_x * 4; - if (!dst_argb || !table_argb || width <= 0 || height <= 0 || dst_x < 0 || - dst_y < 0) { - return -1; - } - // Coalesce rows. - if (dst_stride_argb == width * 4) { - width *= height; - height = 1; - dst_stride_argb = 0; - } -#if defined(HAS_ARGBCOLORTABLEROW_X86) - if (TestCpuFlag(kCpuHasX86)) { - ARGBColorTableRow = ARGBColorTableRow_X86; - } -#endif - for (y = 0; y < height; ++y) { - ARGBColorTableRow(dst, table_argb, width); - dst += dst_stride_argb; - } - return 0; -} - -// Apply a color table each ARGB pixel but preserve destination alpha. -// Table contains 256 ARGB values. -LIBYUV_API -int RGBColorTable(uint8_t* dst_argb, - int dst_stride_argb, - const uint8_t* table_argb, - int dst_x, - int dst_y, - int width, - int height) { - int y; - void (*RGBColorTableRow)(uint8_t * dst_argb, const uint8_t* table_argb, - int width) = RGBColorTableRow_C; - uint8_t* dst = dst_argb + dst_y * dst_stride_argb + dst_x * 4; - if (!dst_argb || !table_argb || width <= 0 || height <= 0 || dst_x < 0 || - dst_y < 0) { - return -1; - } - // Coalesce rows. - if (dst_stride_argb == width * 4) { - width *= height; - height = 1; - dst_stride_argb = 0; - } -#if defined(HAS_RGBCOLORTABLEROW_X86) - if (TestCpuFlag(kCpuHasX86)) { - RGBColorTableRow = RGBColorTableRow_X86; - } -#endif - for (y = 0; y < height; ++y) { - RGBColorTableRow(dst, table_argb, width); - dst += dst_stride_argb; - } - return 0; -} - -// ARGBQuantize is used to posterize art. -// e.g. rgb / qvalue * qvalue + qvalue / 2 -// But the low levels implement efficiently with 3 parameters, and could be -// used for other high level operations. -// dst_argb[0] = (b * scale >> 16) * interval_size + interval_offset; -// where scale is 1 / interval_size as a fixed point value. -// The divide is replaces with a multiply by reciprocal fixed point multiply. -// Caveat - although SSE2 saturates, the C function does not and should be used -// with care if doing anything but quantization. 
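
Aside: a worked example of the quantize arithmetic described in the comment above. For a posterization step of interval_size, the caller passes scale = 65536 / interval_size (the reciprocal in 16.16 fixed point), so (v * scale >> 16) reproduces v / interval_size without a divide; multiplying back by interval_size and adding interval_offset (typically interval_size / 2, per the comment) recentres each band. The concrete values below are illustrative only.

// Worked example of ARGBQuantize parameters for a posterize step of 32.
#include <stdio.h>

int main(void) {
  int interval_size = 32;
  int interval_offset = interval_size / 2;   // centre of each band
  int scale = 65536 / interval_size;         // 1/interval_size in 16.16
  for (int v = 0; v <= 255; v += 85) {
    int by_divide = v / interval_size * interval_size + interval_offset;
    int by_scale = (v * scale >> 16) * interval_size + interval_offset;
    printf("v=%3d  divide=%3d  fixed-point=%3d\n", v, by_divide, by_scale);
  }
  return 0;  // both columns agree: 16, 80, 176, 240
}
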
-LIBYUV_API -int ARGBQuantize(uint8_t* dst_argb, - int dst_stride_argb, - int scale, - int interval_size, - int interval_offset, - int dst_x, - int dst_y, - int width, - int height) { - int y; - void (*ARGBQuantizeRow)(uint8_t * dst_argb, int scale, int interval_size, - int interval_offset, int width) = ARGBQuantizeRow_C; - uint8_t* dst = dst_argb + dst_y * dst_stride_argb + dst_x * 4; - if (!dst_argb || width <= 0 || height <= 0 || dst_x < 0 || dst_y < 0 || - interval_size < 1 || interval_size > 255) { - return -1; - } - // Coalesce rows. - if (dst_stride_argb == width * 4) { - width *= height; - height = 1; - dst_stride_argb = 0; - } -#if defined(HAS_ARGBQUANTIZEROW_SSE2) - if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 4)) { - ARGBQuantizeRow = ARGBQuantizeRow_SSE2; - } -#endif -#if defined(HAS_ARGBQUANTIZEROW_NEON) - if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 8)) { - ARGBQuantizeRow = ARGBQuantizeRow_NEON; - } -#endif -#if defined(HAS_ARGBQUANTIZEROW_MSA) - if (TestCpuFlag(kCpuHasMSA) && IS_ALIGNED(width, 8)) { - ARGBQuantizeRow = ARGBQuantizeRow_MSA; - } -#endif - for (y = 0; y < height; ++y) { - ARGBQuantizeRow(dst, scale, interval_size, interval_offset, width); - dst += dst_stride_argb; - } - return 0; -} - -// Computes table of cumulative sum for image where the value is the sum -// of all values above and to the left of the entry. Used by ARGBBlur. -LIBYUV_API -int ARGBComputeCumulativeSum(const uint8_t* src_argb, - int src_stride_argb, - int32_t* dst_cumsum, - int dst_stride32_cumsum, - int width, - int height) { - int y; - void (*ComputeCumulativeSumRow)(const uint8_t* row, int32_t* cumsum, - const int32_t* previous_cumsum, int width) = - ComputeCumulativeSumRow_C; - int32_t* previous_cumsum = dst_cumsum; - if (!dst_cumsum || !src_argb || width <= 0 || height <= 0) { - return -1; - } -#if defined(HAS_CUMULATIVESUMTOAVERAGEROW_SSE2) - if (TestCpuFlag(kCpuHasSSE2)) { - ComputeCumulativeSumRow = ComputeCumulativeSumRow_SSE2; - } -#endif -#if defined(HAS_CUMULATIVESUMTOAVERAGEROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - ComputeCumulativeSumRow = ComputeCumulativeSumRow_MMI; - } -#endif - - memset(dst_cumsum, 0, width * sizeof(dst_cumsum[0]) * 4); // 4 int per pixel. - for (y = 0; y < height; ++y) { - ComputeCumulativeSumRow(src_argb, dst_cumsum, previous_cumsum, width); - previous_cumsum = dst_cumsum; - dst_cumsum += dst_stride32_cumsum; - src_argb += src_stride_argb; - } - return 0; -} - -// Blur ARGB image. -// Caller should allocate CumulativeSum table of width * height * 16 bytes -// aligned to 16 byte boundary. height can be radius * 2 + 2 to save memory -// as the buffer is treated as circular. 
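As an editorial sketch of the allocation the comment above asks for (the buffer names and the stride choice of width * 4 int32_t per cumulative-sum row are assumptions, not taken from this file; align_buffer_64 / free_aligned_buffer_64 are the helpers used elsewhere in this source):

    // Hypothetical caller-side setup for ARGBBlur; src/dst planes provided by the caller.
    int radius = 4;
    int cumsum_rows = radius * 2 + 2;                    // circular buffer, per the comment above
    align_buffer_64(cumsum, width * cumsum_rows * 16);   // 16 bytes (4 x int32_t) per pixel
    ARGBBlur(src_argb, src_stride_argb, dst_argb, dst_stride_argb,
             (int32_t*)cumsum, width * 4 /* stride in int32_t units */,
             width, height, radius);
    free_aligned_buffer_64(cumsum);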
-LIBYUV_API -int ARGBBlur(const uint8_t* src_argb, - int src_stride_argb, - uint8_t* dst_argb, - int dst_stride_argb, - int32_t* dst_cumsum, - int dst_stride32_cumsum, - int width, - int height, - int radius) { - int y; - void (*ComputeCumulativeSumRow)(const uint8_t* row, int32_t* cumsum, - const int32_t* previous_cumsum, int width) = - ComputeCumulativeSumRow_C; - void (*CumulativeSumToAverageRow)( - const int32_t* topleft, const int32_t* botleft, int width, int area, - uint8_t* dst, int count) = CumulativeSumToAverageRow_C; - int32_t* cumsum_bot_row; - int32_t* max_cumsum_bot_row; - int32_t* cumsum_top_row; - - if (!src_argb || !dst_argb || width <= 0 || height == 0) { - return -1; - } - if (height < 0) { - height = -height; - src_argb = src_argb + (height - 1) * src_stride_argb; - src_stride_argb = -src_stride_argb; - } - if (radius > height) { - radius = height; - } - if (radius > (width / 2 - 1)) { - radius = width / 2 - 1; - } - if (radius <= 0) { - return -1; - } -#if defined(HAS_CUMULATIVESUMTOAVERAGEROW_SSE2) - if (TestCpuFlag(kCpuHasSSE2)) { - ComputeCumulativeSumRow = ComputeCumulativeSumRow_SSE2; - CumulativeSumToAverageRow = CumulativeSumToAverageRow_SSE2; - } -#endif -#if defined(HAS_CUMULATIVESUMTOAVERAGEROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - ComputeCumulativeSumRow = ComputeCumulativeSumRow_MMI; - } -#endif - // Compute enough CumulativeSum for first row to be blurred. After this - // one row of CumulativeSum is updated at a time. - ARGBComputeCumulativeSum(src_argb, src_stride_argb, dst_cumsum, - dst_stride32_cumsum, width, radius); - - src_argb = src_argb + radius * src_stride_argb; - cumsum_bot_row = &dst_cumsum[(radius - 1) * dst_stride32_cumsum]; - - max_cumsum_bot_row = &dst_cumsum[(radius * 2 + 2) * dst_stride32_cumsum]; - cumsum_top_row = &dst_cumsum[0]; - - for (y = 0; y < height; ++y) { - int top_y = ((y - radius - 1) >= 0) ? (y - radius - 1) : 0; - int bot_y = ((y + radius) < height) ? (y + radius) : (height - 1); - int area = radius * (bot_y - top_y); - int boxwidth = radius * 4; - int x; - int n; - - // Increment cumsum_top_row pointer with circular buffer wrap around. - if (top_y) { - cumsum_top_row += dst_stride32_cumsum; - if (cumsum_top_row >= max_cumsum_bot_row) { - cumsum_top_row = dst_cumsum; - } - } - // Increment cumsum_bot_row pointer with circular buffer wrap around and - // then fill in a row of CumulativeSum. - if ((y + radius) < height) { - const int32_t* prev_cumsum_bot_row = cumsum_bot_row; - cumsum_bot_row += dst_stride32_cumsum; - if (cumsum_bot_row >= max_cumsum_bot_row) { - cumsum_bot_row = dst_cumsum; - } - ComputeCumulativeSumRow(src_argb, cumsum_bot_row, prev_cumsum_bot_row, - width); - src_argb += src_stride_argb; - } - - // Left clipped. - for (x = 0; x < radius + 1; ++x) { - CumulativeSumToAverageRow(cumsum_top_row, cumsum_bot_row, boxwidth, area, - &dst_argb[x * 4], 1); - area += (bot_y - top_y); - boxwidth += 4; - } - - // Middle unclipped. - n = (width - 1) - radius - x + 1; - CumulativeSumToAverageRow(cumsum_top_row, cumsum_bot_row, boxwidth, area, - &dst_argb[x * 4], n); - - // Right clipped. - for (x += n; x <= width - 1; ++x) { - area -= (bot_y - top_y); - boxwidth -= 4; - CumulativeSumToAverageRow(cumsum_top_row + (x - radius - 1) * 4, - cumsum_bot_row + (x - radius - 1) * 4, boxwidth, - area, &dst_argb[x * 4], 1); - } - dst_argb += dst_stride_argb; - } - return 0; -} - -// Multiply ARGB image by a specified ARGB value. 
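The value argument of ARGBShade below appears to act as a per-channel gain in 8.8 fixed point (0x80 roughly halves a channel, 0xFF leaves it nearly unchanged); that reading is an editorial gloss on the row functions rather than documented behaviour. A hedged example that darkens all four channels to about half intensity:

    // Hypothetical call; exact rounding depends on the row function selected at runtime.
    ARGBShade(src_argb, src_stride_argb, dst_argb, dst_stride_argb,
              width, height, 0x80808080u);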
-LIBYUV_API -int ARGBShade(const uint8_t* src_argb, - int src_stride_argb, - uint8_t* dst_argb, - int dst_stride_argb, - int width, - int height, - uint32_t value) { - int y; - void (*ARGBShadeRow)(const uint8_t* src_argb, uint8_t* dst_argb, int width, - uint32_t value) = ARGBShadeRow_C; - if (!src_argb || !dst_argb || width <= 0 || height == 0 || value == 0u) { - return -1; - } - if (height < 0) { - height = -height; - src_argb = src_argb + (height - 1) * src_stride_argb; - src_stride_argb = -src_stride_argb; - } - // Coalesce rows. - if (src_stride_argb == width * 4 && dst_stride_argb == width * 4) { - width *= height; - height = 1; - src_stride_argb = dst_stride_argb = 0; - } -#if defined(HAS_ARGBSHADEROW_SSE2) - if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 4)) { - ARGBShadeRow = ARGBShadeRow_SSE2; - } -#endif -#if defined(HAS_ARGBSHADEROW_NEON) - if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 8)) { - ARGBShadeRow = ARGBShadeRow_NEON; - } -#endif -#if defined(HAS_ARGBSHADEROW_MMI) - if (TestCpuFlag(kCpuHasMMI) && IS_ALIGNED(width, 2)) { - ARGBShadeRow = ARGBShadeRow_MMI; - } -#endif -#if defined(HAS_ARGBSHADEROW_MSA) - if (TestCpuFlag(kCpuHasMSA) && IS_ALIGNED(width, 4)) { - ARGBShadeRow = ARGBShadeRow_MSA; - } -#endif - - for (y = 0; y < height; ++y) { - ARGBShadeRow(src_argb, dst_argb, width, value); - src_argb += src_stride_argb; - dst_argb += dst_stride_argb; - } - return 0; -} - -// Interpolate 2 planes by specified amount (0 to 255). -LIBYUV_API -int InterpolatePlane(const uint8_t* src0, - int src_stride0, - const uint8_t* src1, - int src_stride1, - uint8_t* dst, - int dst_stride, - int width, - int height, - int interpolation) { - int y; - void (*InterpolateRow)(uint8_t * dst_ptr, const uint8_t* src_ptr, - ptrdiff_t src_stride, int dst_width, - int source_y_fraction) = InterpolateRow_C; - if (!src0 || !src1 || !dst || width <= 0 || height == 0) { - return -1; - } - // Negative height means invert the image. - if (height < 0) { - height = -height; - dst = dst + (height - 1) * dst_stride; - dst_stride = -dst_stride; - } - // Coalesce rows. - if (src_stride0 == width && src_stride1 == width && dst_stride == width) { - width *= height; - height = 1; - src_stride0 = src_stride1 = dst_stride = 0; - } -#if defined(HAS_INTERPOLATEROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - InterpolateRow = InterpolateRow_Any_SSSE3; - if (IS_ALIGNED(width, 16)) { - InterpolateRow = InterpolateRow_SSSE3; - } - } -#endif -#if defined(HAS_INTERPOLATEROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - InterpolateRow = InterpolateRow_Any_AVX2; - if (IS_ALIGNED(width, 32)) { - InterpolateRow = InterpolateRow_AVX2; - } - } -#endif -#if defined(HAS_INTERPOLATEROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - InterpolateRow = InterpolateRow_Any_NEON; - if (IS_ALIGNED(width, 16)) { - InterpolateRow = InterpolateRow_NEON; - } - } -#endif -#if defined(HAS_INTERPOLATEROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - InterpolateRow = InterpolateRow_Any_MMI; - if (IS_ALIGNED(width, 8)) { - InterpolateRow = InterpolateRow_MMI; - } - } -#endif -#if defined(HAS_INTERPOLATEROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - InterpolateRow = InterpolateRow_Any_MSA; - if (IS_ALIGNED(width, 32)) { - InterpolateRow = InterpolateRow_MSA; - } - } -#endif - - for (y = 0; y < height; ++y) { - InterpolateRow(dst, src0, src1 - src0, width, interpolation); - src0 += src_stride0; - src1 += src_stride1; - dst += dst_stride; - } - return 0; -} - -// Interpolate 2 ARGB images by specified amount (0 to 255). 
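The plane interpolator above (and the ARGB wrapper that follows) lends itself to simple cross-fades. As a sketch, with the interpolation amount read from the 0-to-255 range in the comments (0 giving the first source, 128 an even blend, 255 essentially the second source; buffer names are invented):

    // Hypothetical 50/50 blend of two Y planes of identical size.
    InterpolatePlane(frame0_y, stride0, frame1_y, stride1,
                     blended_y, dst_stride, width, height, 128);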
-LIBYUV_API -int ARGBInterpolate(const uint8_t* src_argb0, - int src_stride_argb0, - const uint8_t* src_argb1, - int src_stride_argb1, - uint8_t* dst_argb, - int dst_stride_argb, - int width, - int height, - int interpolation) { - return InterpolatePlane(src_argb0, src_stride_argb0, src_argb1, - src_stride_argb1, dst_argb, dst_stride_argb, - width * 4, height, interpolation); -} - -// Interpolate 2 YUV images by specified amount (0 to 255). -LIBYUV_API -int I420Interpolate(const uint8_t* src0_y, - int src0_stride_y, - const uint8_t* src0_u, - int src0_stride_u, - const uint8_t* src0_v, - int src0_stride_v, - const uint8_t* src1_y, - int src1_stride_y, - const uint8_t* src1_u, - int src1_stride_u, - const uint8_t* src1_v, - int src1_stride_v, - uint8_t* dst_y, - int dst_stride_y, - uint8_t* dst_u, - int dst_stride_u, - uint8_t* dst_v, - int dst_stride_v, - int width, - int height, - int interpolation) { - int halfwidth = (width + 1) >> 1; - int halfheight = (height + 1) >> 1; - if (!src0_y || !src0_u || !src0_v || !src1_y || !src1_u || !src1_v || - !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) { - return -1; - } - InterpolatePlane(src0_y, src0_stride_y, src1_y, src1_stride_y, dst_y, - dst_stride_y, width, height, interpolation); - InterpolatePlane(src0_u, src0_stride_u, src1_u, src1_stride_u, dst_u, - dst_stride_u, halfwidth, halfheight, interpolation); - InterpolatePlane(src0_v, src0_stride_v, src1_v, src1_stride_v, dst_v, - dst_stride_v, halfwidth, halfheight, interpolation); - return 0; -} - -// Shuffle ARGB channel order. e.g. BGRA to ARGB. -LIBYUV_API -int ARGBShuffle(const uint8_t* src_bgra, - int src_stride_bgra, - uint8_t* dst_argb, - int dst_stride_argb, - const uint8_t* shuffler, - int width, - int height) { - int y; - void (*ARGBShuffleRow)(const uint8_t* src_bgra, uint8_t* dst_argb, - const uint8_t* shuffler, int width) = ARGBShuffleRow_C; - if (!src_bgra || !dst_argb || width <= 0 || height == 0) { - return -1; - } - // Negative height means invert the image. - if (height < 0) { - height = -height; - src_bgra = src_bgra + (height - 1) * src_stride_bgra; - src_stride_bgra = -src_stride_bgra; - } - // Coalesce rows. - if (src_stride_bgra == width * 4 && dst_stride_argb == width * 4) { - width *= height; - height = 1; - src_stride_bgra = dst_stride_argb = 0; - } -#if defined(HAS_ARGBSHUFFLEROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - ARGBShuffleRow = ARGBShuffleRow_Any_SSSE3; - if (IS_ALIGNED(width, 8)) { - ARGBShuffleRow = ARGBShuffleRow_SSSE3; - } - } -#endif -#if defined(HAS_ARGBSHUFFLEROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - ARGBShuffleRow = ARGBShuffleRow_Any_AVX2; - if (IS_ALIGNED(width, 16)) { - ARGBShuffleRow = ARGBShuffleRow_AVX2; - } - } -#endif -#if defined(HAS_ARGBSHUFFLEROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - ARGBShuffleRow = ARGBShuffleRow_Any_NEON; - if (IS_ALIGNED(width, 4)) { - ARGBShuffleRow = ARGBShuffleRow_NEON; - } - } -#endif -#if defined(HAS_ARGBSHUFFLEROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - ARGBShuffleRow = ARGBShuffleRow_Any_MMI; - if (IS_ALIGNED(width, 2)) { - ARGBShuffleRow = ARGBShuffleRow_MMI; - } - } -#endif -#if defined(HAS_ARGBSHUFFLEROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - ARGBShuffleRow = ARGBShuffleRow_Any_MSA; - if (IS_ALIGNED(width, 8)) { - ARGBShuffleRow = ARGBShuffleRow_MSA; - } - } -#endif - - for (y = 0; y < height; ++y) { - ARGBShuffleRow(src_bgra, dst_argb, shuffler, width); - src_bgra += src_stride_bgra; - dst_argb += dst_stride_argb; - } - return 0; -} - -// Shuffle AR64 channel order. e.g. AR64 to AB64. 
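For ARGBShuffle above, the shuffler is a table of byte indices applied within each 4-byte pixel; the SIMD paths appear to read it as a full 16-byte mask covering four pixels, so repeating the pattern is the safe form. An illustrative, editor-supplied mask that swaps the blue and red channels:

    // Hypothetical mask: four pixels of {2, 1, 0, 3}.
    static const uint8_t kSwapBR[16] = {2, 1, 0, 3,  6,  5,  4,  7,
                                        10, 9, 8, 11, 14, 13, 12, 15};
    ARGBShuffle(src_argb, src_stride, dst_argb, dst_stride, kSwapBR, width, height);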
-LIBYUV_API -int AR64Shuffle(const uint16_t* src_ar64, - int src_stride_ar64, - uint16_t* dst_ar64, - int dst_stride_ar64, - const uint8_t* shuffler, - int width, - int height) { - int y; - void (*AR64ShuffleRow)(const uint8_t* src_ar64, uint8_t* dst_ar64, - const uint8_t* shuffler, int width) = AR64ShuffleRow_C; - if (!src_ar64 || !dst_ar64 || width <= 0 || height == 0) { - return -1; - } - // Negative height means invert the image. - if (height < 0) { - height = -height; - src_ar64 = src_ar64 + (height - 1) * src_stride_ar64; - src_stride_ar64 = -src_stride_ar64; - } - // Coalesce rows. - if (src_stride_ar64 == width * 4 && dst_stride_ar64 == width * 4) { - width *= height; - height = 1; - src_stride_ar64 = dst_stride_ar64 = 0; - } - // Assembly versions can be reused if it's implemented with shuffle. -#if defined(HAS_ARGBSHUFFLEROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - AR64ShuffleRow = ARGBShuffleRow_Any_SSSE3; - if (IS_ALIGNED(width, 8)) { - AR64ShuffleRow = ARGBShuffleRow_SSSE3; - } - } -#endif -#if defined(HAS_ARGBSHUFFLEROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - AR64ShuffleRow = ARGBShuffleRow_Any_AVX2; - if (IS_ALIGNED(width, 16)) { - AR64ShuffleRow = ARGBShuffleRow_AVX2; - } - } -#endif -#if defined(HAS_ARGBSHUFFLEROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - AR64ShuffleRow = ARGBShuffleRow_Any_NEON; - if (IS_ALIGNED(width, 4)) { - AR64ShuffleRow = ARGBShuffleRow_NEON; - } - } -#endif -#if defined(HAS_ARGBSHUFFLEROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - AR64ShuffleRow = ARGBShuffleRow_Any_MMI; - if (IS_ALIGNED(width, 2)) { - AR64ShuffleRow = ARGBShuffleRow_MMI; - } - } -#endif - - for (y = 0; y < height; ++y) { - AR64ShuffleRow((uint8_t*)(src_ar64), (uint8_t*)(dst_ar64), shuffler, - width * 2); - src_ar64 += src_stride_ar64; - dst_ar64 += dst_stride_ar64; - } - return 0; -} - -// Gauss blur a float plane using Gaussian 5x5 filter with -// coefficients of 1, 4, 6, 4, 1. -// Each destination pixel is a blur of the 5x5 -// pixels from the source. -// Source edges are clamped. -// Edge is 2 pixels on each side, and interior is multiple of 4. -LIBYUV_API -int GaussPlane_F32(const float* src, - int src_stride, - float* dst, - int dst_stride, - int width, - int height) { - int y; - void (*GaussCol_F32)(const float* src0, const float* src1, const float* src2, - const float* src3, const float* src4, float* dst, - int width) = GaussCol_F32_C; - void (*GaussRow_F32)(const float* src, float* dst, int width) = - GaussRow_F32_C; - if (!src || !dst || width <= 0 || height == 0) { - return -1; - } - // Negative height means invert the image. - if (height < 0) { - height = -height; - src = src + (height - 1) * src_stride; - src_stride = -src_stride; - } - -#if defined(HAS_GAUSSCOL_F32_NEON) - if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 8)) { - GaussCol_F32 = GaussCol_F32_NEON; - } -#endif -#if defined(HAS_GAUSSROW_F32_NEON) - if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 8)) { - GaussRow_F32 = GaussRow_F32_NEON; - } -#endif - { - // 2 pixels on each side, but aligned out to 16 bytes. - align_buffer_64(rowbuf, (4 + width + 4) * 4); - memset(rowbuf, 0, 16); - memset(rowbuf + (4 + width) * 4, 0, 16); - float* row = (float*)(rowbuf + 16); - const float* src0 = src; - const float* src1 = src; - const float* src2 = src; - const float* src3 = src2 + ((height > 1) ? src_stride : 0); - const float* src4 = src3 + ((height > 2) ? 
src_stride : 0); - - for (y = 0; y < height; ++y) { - GaussCol_F32(src0, src1, src2, src3, src4, row, width); - - // Extrude edge by 2 floats - row[-2] = row[-1] = row[0]; - row[width + 1] = row[width] = row[width - 1]; - - GaussRow_F32(row - 2, dst, width); - - src0 = src1; - src1 = src2; - src2 = src3; - src3 = src4; - if ((y + 2) < (height - 1)) { - src4 += src_stride; - } - dst += dst_stride; - } - free_aligned_buffer_64(rowbuf); - } - return 0; -} - -// Sobel ARGB effect. -static int ARGBSobelize(const uint8_t* src_argb, - int src_stride_argb, - uint8_t* dst_argb, - int dst_stride_argb, - int width, - int height, - void (*SobelRow)(const uint8_t* src_sobelx, - const uint8_t* src_sobely, - uint8_t* dst, - int width)) { - int y; - void (*ARGBToYJRow)(const uint8_t* src_argb, uint8_t* dst_g, int width) = - ARGBToYJRow_C; - void (*SobelYRow)(const uint8_t* src_y0, const uint8_t* src_y1, - uint8_t* dst_sobely, int width) = SobelYRow_C; - void (*SobelXRow)(const uint8_t* src_y0, const uint8_t* src_y1, - const uint8_t* src_y2, uint8_t* dst_sobely, int width) = - SobelXRow_C; - const int kEdge = 16; // Extra pixels at start of row for extrude/align. - if (!src_argb || !dst_argb || width <= 0 || height == 0) { - return -1; - } - // Negative height means invert the image. - if (height < 0) { - height = -height; - src_argb = src_argb + (height - 1) * src_stride_argb; - src_stride_argb = -src_stride_argb; - } - -#if defined(HAS_ARGBTOYJROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - ARGBToYJRow = ARGBToYJRow_Any_SSSE3; - if (IS_ALIGNED(width, 16)) { - ARGBToYJRow = ARGBToYJRow_SSSE3; - } - } -#endif -#if defined(HAS_ARGBTOYJROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - ARGBToYJRow = ARGBToYJRow_Any_AVX2; - if (IS_ALIGNED(width, 32)) { - ARGBToYJRow = ARGBToYJRow_AVX2; - } - } -#endif -#if defined(HAS_ARGBTOYJROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - ARGBToYJRow = ARGBToYJRow_Any_NEON; - if (IS_ALIGNED(width, 8)) { - ARGBToYJRow = ARGBToYJRow_NEON; - } - } -#endif -#if defined(HAS_ARGBTOYJROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - ARGBToYJRow = ARGBToYJRow_Any_MMI; - if (IS_ALIGNED(width, 8)) { - ARGBToYJRow = ARGBToYJRow_MMI; - } - } -#endif -#if defined(HAS_ARGBTOYJROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - ARGBToYJRow = ARGBToYJRow_Any_MSA; - if (IS_ALIGNED(width, 16)) { - ARGBToYJRow = ARGBToYJRow_MSA; - } - } -#endif - -#if defined(HAS_SOBELYROW_SSE2) - if (TestCpuFlag(kCpuHasSSE2)) { - SobelYRow = SobelYRow_SSE2; - } -#endif -#if defined(HAS_SOBELYROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - SobelYRow = SobelYRow_NEON; - } -#endif -#if defined(HAS_SOBELYROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - SobelYRow = SobelYRow_MMI; - } -#endif -#if defined(HAS_SOBELYROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - SobelYRow = SobelYRow_MSA; - } -#endif -#if defined(HAS_SOBELXROW_SSE2) - if (TestCpuFlag(kCpuHasSSE2)) { - SobelXRow = SobelXRow_SSE2; - } -#endif -#if defined(HAS_SOBELXROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - SobelXRow = SobelXRow_NEON; - } -#endif -#if defined(HAS_SOBELXROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - SobelXRow = SobelXRow_MMI; - } -#endif -#if defined(HAS_SOBELXROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - SobelXRow = SobelXRow_MSA; - } -#endif - { - // 3 rows with edges before/after. - const int kRowSize = (width + kEdge + 31) & ~31; - align_buffer_64(rows, kRowSize * 2 + (kEdge + kRowSize * 3 + kEdge)); - uint8_t* row_sobelx = rows; - uint8_t* row_sobely = rows + kRowSize; - uint8_t* row_y = rows + kRowSize * 2; - - // Convert first row. 
- uint8_t* row_y0 = row_y + kEdge; - uint8_t* row_y1 = row_y0 + kRowSize; - uint8_t* row_y2 = row_y1 + kRowSize; - ARGBToYJRow(src_argb, row_y0, width); - row_y0[-1] = row_y0[0]; - memset(row_y0 + width, row_y0[width - 1], 16); // Extrude 16 for valgrind. - ARGBToYJRow(src_argb, row_y1, width); - row_y1[-1] = row_y1[0]; - memset(row_y1 + width, row_y1[width - 1], 16); - memset(row_y2 + width, 0, 16); - - for (y = 0; y < height; ++y) { - // Convert next row of ARGB to G. - if (y < (height - 1)) { - src_argb += src_stride_argb; - } - ARGBToYJRow(src_argb, row_y2, width); - row_y2[-1] = row_y2[0]; - row_y2[width] = row_y2[width - 1]; - - SobelXRow(row_y0 - 1, row_y1 - 1, row_y2 - 1, row_sobelx, width); - SobelYRow(row_y0 - 1, row_y2 - 1, row_sobely, width); - SobelRow(row_sobelx, row_sobely, dst_argb, width); - - // Cycle thru circular queue of 3 row_y buffers. - { - uint8_t* row_yt = row_y0; - row_y0 = row_y1; - row_y1 = row_y2; - row_y2 = row_yt; - } - - dst_argb += dst_stride_argb; - } - free_aligned_buffer_64(rows); - } - return 0; -} - -// Sobel ARGB effect. -LIBYUV_API -int ARGBSobel(const uint8_t* src_argb, - int src_stride_argb, - uint8_t* dst_argb, - int dst_stride_argb, - int width, - int height) { - void (*SobelRow)(const uint8_t* src_sobelx, const uint8_t* src_sobely, - uint8_t* dst_argb, int width) = SobelRow_C; -#if defined(HAS_SOBELROW_SSE2) - if (TestCpuFlag(kCpuHasSSE2)) { - SobelRow = SobelRow_Any_SSE2; - if (IS_ALIGNED(width, 16)) { - SobelRow = SobelRow_SSE2; - } - } -#endif -#if defined(HAS_SOBELROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - SobelRow = SobelRow_Any_NEON; - if (IS_ALIGNED(width, 8)) { - SobelRow = SobelRow_NEON; - } - } -#endif -#if defined(HAS_SOBELROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - SobelRow = SobelRow_Any_MMI; - if (IS_ALIGNED(width, 8)) { - SobelRow = SobelRow_MMI; - } - } -#endif -#if defined(HAS_SOBELROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - SobelRow = SobelRow_Any_MSA; - if (IS_ALIGNED(width, 16)) { - SobelRow = SobelRow_MSA; - } - } -#endif - return ARGBSobelize(src_argb, src_stride_argb, dst_argb, dst_stride_argb, - width, height, SobelRow); -} - -// Sobel ARGB effect with planar output. -LIBYUV_API -int ARGBSobelToPlane(const uint8_t* src_argb, - int src_stride_argb, - uint8_t* dst_y, - int dst_stride_y, - int width, - int height) { - void (*SobelToPlaneRow)(const uint8_t* src_sobelx, const uint8_t* src_sobely, - uint8_t* dst_, int width) = SobelToPlaneRow_C; -#if defined(HAS_SOBELTOPLANEROW_SSE2) - if (TestCpuFlag(kCpuHasSSE2)) { - SobelToPlaneRow = SobelToPlaneRow_Any_SSE2; - if (IS_ALIGNED(width, 16)) { - SobelToPlaneRow = SobelToPlaneRow_SSE2; - } - } -#endif -#if defined(HAS_SOBELTOPLANEROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - SobelToPlaneRow = SobelToPlaneRow_Any_NEON; - if (IS_ALIGNED(width, 16)) { - SobelToPlaneRow = SobelToPlaneRow_NEON; - } - } -#endif -#if defined(HAS_SOBELTOPLANEROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - SobelToPlaneRow = SobelToPlaneRow_Any_MMI; - if (IS_ALIGNED(width, 8)) { - SobelToPlaneRow = SobelToPlaneRow_MMI; - } - } -#endif -#if defined(HAS_SOBELTOPLANEROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - SobelToPlaneRow = SobelToPlaneRow_Any_MSA; - if (IS_ALIGNED(width, 32)) { - SobelToPlaneRow = SobelToPlaneRow_MSA; - } - } -#endif - return ARGBSobelize(src_argb, src_stride_argb, dst_y, dst_stride_y, width, - height, SobelToPlaneRow); -} - -// SobelXY ARGB effect. -// Similar to Sobel, but also stores Sobel X in R and Sobel Y in B. G = Sobel. 
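Before the combined XY variant below, note that the planar variant above is a convenient way to get a single-channel edge-magnitude map out of an ARGB frame (editorial sketch; the destination plane and its stride are caller-provided):

    // Hypothetical edge-map extraction into a tightly packed 8-bit plane.
    ARGBSobelToPlane(src_argb, src_stride_argb, edges, width /* dst_stride_y */,
                     width, height);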
-LIBYUV_API -int ARGBSobelXY(const uint8_t* src_argb, - int src_stride_argb, - uint8_t* dst_argb, - int dst_stride_argb, - int width, - int height) { - void (*SobelXYRow)(const uint8_t* src_sobelx, const uint8_t* src_sobely, - uint8_t* dst_argb, int width) = SobelXYRow_C; -#if defined(HAS_SOBELXYROW_SSE2) - if (TestCpuFlag(kCpuHasSSE2)) { - SobelXYRow = SobelXYRow_Any_SSE2; - if (IS_ALIGNED(width, 16)) { - SobelXYRow = SobelXYRow_SSE2; - } - } -#endif -#if defined(HAS_SOBELXYROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - SobelXYRow = SobelXYRow_Any_NEON; - if (IS_ALIGNED(width, 8)) { - SobelXYRow = SobelXYRow_NEON; - } - } -#endif -#if defined(HAS_SOBELXYROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - SobelXYRow = SobelXYRow_Any_MMI; - if (IS_ALIGNED(width, 8)) { - SobelXYRow = SobelXYRow_MMI; - } - } -#endif -#if defined(HAS_SOBELXYROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - SobelXYRow = SobelXYRow_Any_MSA; - if (IS_ALIGNED(width, 16)) { - SobelXYRow = SobelXYRow_MSA; - } - } -#endif - return ARGBSobelize(src_argb, src_stride_argb, dst_argb, dst_stride_argb, - width, height, SobelXYRow); -} - -// Apply a 4x4 polynomial to each ARGB pixel. -LIBYUV_API -int ARGBPolynomial(const uint8_t* src_argb, - int src_stride_argb, - uint8_t* dst_argb, - int dst_stride_argb, - const float* poly, - int width, - int height) { - int y; - void (*ARGBPolynomialRow)(const uint8_t* src_argb, uint8_t* dst_argb, - const float* poly, int width) = ARGBPolynomialRow_C; - if (!src_argb || !dst_argb || !poly || width <= 0 || height == 0) { - return -1; - } - // Negative height means invert the image. - if (height < 0) { - height = -height; - src_argb = src_argb + (height - 1) * src_stride_argb; - src_stride_argb = -src_stride_argb; - } - // Coalesce rows. - if (src_stride_argb == width * 4 && dst_stride_argb == width * 4) { - width *= height; - height = 1; - src_stride_argb = dst_stride_argb = 0; - } -#if defined(HAS_ARGBPOLYNOMIALROW_SSE2) - if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 2)) { - ARGBPolynomialRow = ARGBPolynomialRow_SSE2; - } -#endif -#if defined(HAS_ARGBPOLYNOMIALROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2) && TestCpuFlag(kCpuHasFMA3) && - IS_ALIGNED(width, 2)) { - ARGBPolynomialRow = ARGBPolynomialRow_AVX2; - } -#endif - - for (y = 0; y < height; ++y) { - ARGBPolynomialRow(src_argb, dst_argb, poly, width); - src_argb += src_stride_argb; - dst_argb += dst_stride_argb; - } - return 0; -} - -// Convert plane of 16 bit shorts to half floats. -// Source values are multiplied by scale before storing as half float. -LIBYUV_API -int HalfFloatPlane(const uint16_t* src_y, - int src_stride_y, - uint16_t* dst_y, - int dst_stride_y, - float scale, - int width, - int height) { - int y; - void (*HalfFloatRow)(const uint16_t* src, uint16_t* dst, float scale, - int width) = HalfFloatRow_C; - if (!src_y || !dst_y || width <= 0 || height == 0) { - return -1; - } - src_stride_y >>= 1; - dst_stride_y >>= 1; - // Negative height means invert the image. - if (height < 0) { - height = -height; - src_y = src_y + (height - 1) * src_stride_y; - src_stride_y = -src_stride_y; - } - // Coalesce rows. 
- if (src_stride_y == width && dst_stride_y == width) { - width *= height; - height = 1; - src_stride_y = dst_stride_y = 0; - } -#if defined(HAS_HALFFLOATROW_SSE2) - if (TestCpuFlag(kCpuHasSSE2)) { - HalfFloatRow = HalfFloatRow_Any_SSE2; - if (IS_ALIGNED(width, 8)) { - HalfFloatRow = HalfFloatRow_SSE2; - } - } -#endif -#if defined(HAS_HALFFLOATROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - HalfFloatRow = HalfFloatRow_Any_AVX2; - if (IS_ALIGNED(width, 16)) { - HalfFloatRow = HalfFloatRow_AVX2; - } - } -#endif -#if defined(HAS_HALFFLOATROW_F16C) - if (TestCpuFlag(kCpuHasAVX2) && TestCpuFlag(kCpuHasF16C)) { - HalfFloatRow = - (scale == 1.0f) ? HalfFloat1Row_Any_F16C : HalfFloatRow_Any_F16C; - if (IS_ALIGNED(width, 16)) { - HalfFloatRow = (scale == 1.0f) ? HalfFloat1Row_F16C : HalfFloatRow_F16C; - } - } -#endif -#if defined(HAS_HALFFLOATROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - HalfFloatRow = - (scale == 1.0f) ? HalfFloat1Row_Any_NEON : HalfFloatRow_Any_NEON; - if (IS_ALIGNED(width, 8)) { - HalfFloatRow = (scale == 1.0f) ? HalfFloat1Row_NEON : HalfFloatRow_NEON; - } - } -#endif -#if defined(HAS_HALFFLOATROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - HalfFloatRow = HalfFloatRow_Any_MSA; - if (IS_ALIGNED(width, 32)) { - HalfFloatRow = HalfFloatRow_MSA; - } - } -#endif - - for (y = 0; y < height; ++y) { - HalfFloatRow(src_y, dst_y, scale, width); - src_y += src_stride_y; - dst_y += dst_stride_y; - } - return 0; -} - -// Convert a buffer of bytes to floats, scale the values and store as floats. -LIBYUV_API -int ByteToFloat(const uint8_t* src_y, float* dst_y, float scale, int width) { - void (*ByteToFloatRow)(const uint8_t* src, float* dst, float scale, - int width) = ByteToFloatRow_C; - if (!src_y || !dst_y || width <= 0) { - return -1; - } -#if defined(HAS_BYTETOFLOATROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - ByteToFloatRow = ByteToFloatRow_Any_NEON; - if (IS_ALIGNED(width, 8)) { - ByteToFloatRow = ByteToFloatRow_NEON; - } - } -#endif - - ByteToFloatRow(src_y, dst_y, scale, width); - return 0; -} - -// Apply a lumacolortable to each ARGB pixel. -LIBYUV_API -int ARGBLumaColorTable(const uint8_t* src_argb, - int src_stride_argb, - uint8_t* dst_argb, - int dst_stride_argb, - const uint8_t* luma, - int width, - int height) { - int y; - void (*ARGBLumaColorTableRow)( - const uint8_t* src_argb, uint8_t* dst_argb, int width, - const uint8_t* luma, const uint32_t lumacoeff) = ARGBLumaColorTableRow_C; - if (!src_argb || !dst_argb || !luma || width <= 0 || height == 0) { - return -1; - } - // Negative height means invert the image. - if (height < 0) { - height = -height; - src_argb = src_argb + (height - 1) * src_stride_argb; - src_stride_argb = -src_stride_argb; - } - // Coalesce rows. - if (src_stride_argb == width * 4 && dst_stride_argb == width * 4) { - width *= height; - height = 1; - src_stride_argb = dst_stride_argb = 0; - } -#if defined(HAS_ARGBLUMACOLORTABLEROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 4)) { - ARGBLumaColorTableRow = ARGBLumaColorTableRow_SSSE3; - } -#endif - - for (y = 0; y < height; ++y) { - ARGBLumaColorTableRow(src_argb, dst_argb, width, luma, 0x00264b0f); - src_argb += src_stride_argb; - dst_argb += dst_stride_argb; - } - return 0; -} - -// Copy Alpha from one ARGB image to another. 
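One note on HalfFloatPlane above before the alpha helpers that follow: the strides are byte strides (the function halves them internally), and a common use is normalizing 10-bit samples to 0..1 half floats; the 1.0f / 1023.0f scale is the editor's assumption for that case, not something stated in this file:

    // Hypothetical: normalize 10-bit samples held in uint16_t to 0..1 half floats.
    HalfFloatPlane(src_y10, src_stride_bytes, dst_half, dst_stride_bytes,
                   1.0f / 1023.0f, width, height);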
-LIBYUV_API -int ARGBCopyAlpha(const uint8_t* src_argb, - int src_stride_argb, - uint8_t* dst_argb, - int dst_stride_argb, - int width, - int height) { - int y; - void (*ARGBCopyAlphaRow)(const uint8_t* src_argb, uint8_t* dst_argb, - int width) = ARGBCopyAlphaRow_C; - if (!src_argb || !dst_argb || width <= 0 || height == 0) { - return -1; - } - // Negative height means invert the image. - if (height < 0) { - height = -height; - src_argb = src_argb + (height - 1) * src_stride_argb; - src_stride_argb = -src_stride_argb; - } - // Coalesce rows. - if (src_stride_argb == width * 4 && dst_stride_argb == width * 4) { - width *= height; - height = 1; - src_stride_argb = dst_stride_argb = 0; - } -#if defined(HAS_ARGBCOPYALPHAROW_SSE2) - if (TestCpuFlag(kCpuHasSSE2)) { - ARGBCopyAlphaRow = ARGBCopyAlphaRow_Any_SSE2; - if (IS_ALIGNED(width, 8)) { - ARGBCopyAlphaRow = ARGBCopyAlphaRow_SSE2; - } - } -#endif -#if defined(HAS_ARGBCOPYALPHAROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - ARGBCopyAlphaRow = ARGBCopyAlphaRow_Any_AVX2; - if (IS_ALIGNED(width, 16)) { - ARGBCopyAlphaRow = ARGBCopyAlphaRow_AVX2; - } - } -#endif -#if defined(HAS_ARGBCOPYALPHAROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - ARGBCopyAlphaRow = ARGBCopyAlphaRow_Any_MMI; - if (IS_ALIGNED(width, 2)) { - ARGBCopyAlphaRow = ARGBCopyAlphaRow_MMI; - } - } -#endif - - for (y = 0; y < height; ++y) { - ARGBCopyAlphaRow(src_argb, dst_argb, width); - src_argb += src_stride_argb; - dst_argb += dst_stride_argb; - } - return 0; -} - -// Extract just the alpha channel from ARGB. -LIBYUV_API -int ARGBExtractAlpha(const uint8_t* src_argb, - int src_stride_argb, - uint8_t* dst_a, - int dst_stride_a, - int width, - int height) { - if (!src_argb || !dst_a || width <= 0 || height == 0) { - return -1; - } - // Negative height means invert the image. - if (height < 0) { - height = -height; - src_argb += (height - 1) * src_stride_argb; - src_stride_argb = -src_stride_argb; - } - // Coalesce rows. - if (src_stride_argb == width * 4 && dst_stride_a == width) { - width *= height; - height = 1; - src_stride_argb = dst_stride_a = 0; - } - void (*ARGBExtractAlphaRow)(const uint8_t* src_argb, uint8_t* dst_a, - int width) = ARGBExtractAlphaRow_C; -#if defined(HAS_ARGBEXTRACTALPHAROW_SSE2) - if (TestCpuFlag(kCpuHasSSE2)) { - ARGBExtractAlphaRow = IS_ALIGNED(width, 8) ? ARGBExtractAlphaRow_SSE2 - : ARGBExtractAlphaRow_Any_SSE2; - } -#endif -#if defined(HAS_ARGBEXTRACTALPHAROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - ARGBExtractAlphaRow = IS_ALIGNED(width, 32) ? ARGBExtractAlphaRow_AVX2 - : ARGBExtractAlphaRow_Any_AVX2; - } -#endif -#if defined(HAS_ARGBEXTRACTALPHAROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - ARGBExtractAlphaRow = IS_ALIGNED(width, 16) ? ARGBExtractAlphaRow_NEON - : ARGBExtractAlphaRow_Any_NEON; - } -#endif -#if defined(HAS_ARGBEXTRACTALPHAROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - ARGBExtractAlphaRow = IS_ALIGNED(width, 8) ? ARGBExtractAlphaRow_MMI - : ARGBExtractAlphaRow_Any_MMI; - } -#endif -#if defined(HAS_ARGBEXTRACTALPHAROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - ARGBExtractAlphaRow = IS_ALIGNED(width, 16) ? ARGBExtractAlphaRow_MSA - : ARGBExtractAlphaRow_Any_MSA; - } -#endif - - for (int y = 0; y < height; ++y) { - ARGBExtractAlphaRow(src_argb, dst_a, width); - src_argb += src_stride_argb; - dst_a += dst_stride_a; - } - return 0; -} - -// Copy a planar Y channel to the alpha channel of a destination ARGB image. 
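ARGBExtractAlpha above and ARGBCopyYToAlpha below are natural counterparts: the first pulls the A channel out into a planar buffer, the second writes a planar buffer back into A. A minimal round-trip sketch (buffer names invented, planes tightly packed):

    // Hypothetical: stash alpha, process the colour channels elsewhere, then restore it.
    ARGBExtractAlpha(argb, argb_stride, alpha, width /* dst_stride_a */, width, height);
    ARGBCopyYToAlpha(alpha, width /* src_stride_y */, argb, argb_stride, width, height);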
-LIBYUV_API -int ARGBCopyYToAlpha(const uint8_t* src_y, - int src_stride_y, - uint8_t* dst_argb, - int dst_stride_argb, - int width, - int height) { - int y; - void (*ARGBCopyYToAlphaRow)(const uint8_t* src_y, uint8_t* dst_argb, - int width) = ARGBCopyYToAlphaRow_C; - if (!src_y || !dst_argb || width <= 0 || height == 0) { - return -1; - } - // Negative height means invert the image. - if (height < 0) { - height = -height; - src_y = src_y + (height - 1) * src_stride_y; - src_stride_y = -src_stride_y; - } - // Coalesce rows. - if (src_stride_y == width && dst_stride_argb == width * 4) { - width *= height; - height = 1; - src_stride_y = dst_stride_argb = 0; - } -#if defined(HAS_ARGBCOPYYTOALPHAROW_SSE2) - if (TestCpuFlag(kCpuHasSSE2)) { - ARGBCopyYToAlphaRow = ARGBCopyYToAlphaRow_Any_SSE2; - if (IS_ALIGNED(width, 8)) { - ARGBCopyYToAlphaRow = ARGBCopyYToAlphaRow_SSE2; - } - } -#endif -#if defined(HAS_ARGBCOPYYTOALPHAROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - ARGBCopyYToAlphaRow = ARGBCopyYToAlphaRow_Any_AVX2; - if (IS_ALIGNED(width, 16)) { - ARGBCopyYToAlphaRow = ARGBCopyYToAlphaRow_AVX2; - } - } -#endif -#if defined(HAS_ARGBCOPYYTOALPHAROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - ARGBCopyYToAlphaRow = ARGBCopyYToAlphaRow_Any_MMI; - if (IS_ALIGNED(width, 8)) { - ARGBCopyYToAlphaRow = ARGBCopyYToAlphaRow_MMI; - } - } -#endif - - for (y = 0; y < height; ++y) { - ARGBCopyYToAlphaRow(src_y, dst_argb, width); - src_y += src_stride_y; - dst_argb += dst_stride_argb; - } - return 0; -} - -// TODO(fbarchard): Consider if width is even Y channel can be split -// directly. A SplitUVRow_Odd function could copy the remaining chroma. - -LIBYUV_API -int YUY2ToNV12(const uint8_t* src_yuy2, - int src_stride_yuy2, - uint8_t* dst_y, - int dst_stride_y, - uint8_t* dst_uv, - int dst_stride_uv, - int width, - int height) { - int y; - int halfwidth = (width + 1) >> 1; - void (*SplitUVRow)(const uint8_t* src_uv, uint8_t* dst_u, uint8_t* dst_v, - int width) = SplitUVRow_C; - void (*InterpolateRow)(uint8_t * dst_ptr, const uint8_t* src_ptr, - ptrdiff_t src_stride, int dst_width, - int source_y_fraction) = InterpolateRow_C; - if (!src_yuy2 || !dst_y || !dst_uv || width <= 0 || height == 0) { - return -1; - } - // Negative height means invert the image. 
- if (height < 0) { - height = -height; - src_yuy2 = src_yuy2 + (height - 1) * src_stride_yuy2; - src_stride_yuy2 = -src_stride_yuy2; - } -#if defined(HAS_SPLITUVROW_SSE2) - if (TestCpuFlag(kCpuHasSSE2)) { - SplitUVRow = SplitUVRow_Any_SSE2; - if (IS_ALIGNED(width, 16)) { - SplitUVRow = SplitUVRow_SSE2; - } - } -#endif -#if defined(HAS_SPLITUVROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - SplitUVRow = SplitUVRow_Any_AVX2; - if (IS_ALIGNED(width, 32)) { - SplitUVRow = SplitUVRow_AVX2; - } - } -#endif -#if defined(HAS_SPLITUVROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - SplitUVRow = SplitUVRow_Any_NEON; - if (IS_ALIGNED(width, 16)) { - SplitUVRow = SplitUVRow_NEON; - } - } -#endif -#if defined(HAS_SPLITUVROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - SplitUVRow = SplitUVRow_Any_MMI; - if (IS_ALIGNED(width, 8)) { - SplitUVRow = SplitUVRow_MMI; - } - } -#endif -#if defined(HAS_SPLITUVROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - SplitUVRow = SplitUVRow_Any_MSA; - if (IS_ALIGNED(width, 32)) { - SplitUVRow = SplitUVRow_MSA; - } - } -#endif -#if defined(HAS_INTERPOLATEROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - InterpolateRow = InterpolateRow_Any_SSSE3; - if (IS_ALIGNED(width, 16)) { - InterpolateRow = InterpolateRow_SSSE3; - } - } -#endif -#if defined(HAS_INTERPOLATEROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - InterpolateRow = InterpolateRow_Any_AVX2; - if (IS_ALIGNED(width, 32)) { - InterpolateRow = InterpolateRow_AVX2; - } - } -#endif -#if defined(HAS_INTERPOLATEROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - InterpolateRow = InterpolateRow_Any_NEON; - if (IS_ALIGNED(width, 16)) { - InterpolateRow = InterpolateRow_NEON; - } - } -#endif -#if defined(HAS_INTERPOLATEROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - InterpolateRow = InterpolateRow_Any_MMI; - if (IS_ALIGNED(width, 8)) { - InterpolateRow = InterpolateRow_MMI; - } - } -#endif -#if defined(HAS_INTERPOLATEROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - InterpolateRow = InterpolateRow_Any_MSA; - if (IS_ALIGNED(width, 32)) { - InterpolateRow = InterpolateRow_MSA; - } - } -#endif - - { - int awidth = halfwidth * 2; - // row of y and 2 rows of uv - align_buffer_64(rows, awidth * 3); - - for (y = 0; y < height - 1; y += 2) { - // Split Y from UV. - SplitUVRow(src_yuy2, rows, rows + awidth, awidth); - memcpy(dst_y, rows, width); - SplitUVRow(src_yuy2 + src_stride_yuy2, rows, rows + awidth * 2, awidth); - memcpy(dst_y + dst_stride_y, rows, width); - InterpolateRow(dst_uv, rows + awidth, awidth, awidth, 128); - src_yuy2 += src_stride_yuy2 * 2; - dst_y += dst_stride_y * 2; - dst_uv += dst_stride_uv; - } - if (height & 1) { - // Split Y from UV. - SplitUVRow(src_yuy2, rows, dst_uv, awidth); - memcpy(dst_y, rows, width); - } - free_aligned_buffer_64(rows); - } - return 0; -} - -LIBYUV_API -int UYVYToNV12(const uint8_t* src_uyvy, - int src_stride_uyvy, - uint8_t* dst_y, - int dst_stride_y, - uint8_t* dst_uv, - int dst_stride_uv, - int width, - int height) { - int y; - int halfwidth = (width + 1) >> 1; - void (*SplitUVRow)(const uint8_t* src_uv, uint8_t* dst_u, uint8_t* dst_v, - int width) = SplitUVRow_C; - void (*InterpolateRow)(uint8_t * dst_ptr, const uint8_t* src_ptr, - ptrdiff_t src_stride, int dst_width, - int source_y_fraction) = InterpolateRow_C; - if (!src_uyvy || !dst_y || !dst_uv || width <= 0 || height == 0) { - return -1; - } - // Negative height means invert the image. 
- if (height < 0) { - height = -height; - src_uyvy = src_uyvy + (height - 1) * src_stride_uyvy; - src_stride_uyvy = -src_stride_uyvy; - } -#if defined(HAS_SPLITUVROW_SSE2) - if (TestCpuFlag(kCpuHasSSE2)) { - SplitUVRow = SplitUVRow_Any_SSE2; - if (IS_ALIGNED(width, 16)) { - SplitUVRow = SplitUVRow_SSE2; - } - } -#endif -#if defined(HAS_SPLITUVROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - SplitUVRow = SplitUVRow_Any_AVX2; - if (IS_ALIGNED(width, 32)) { - SplitUVRow = SplitUVRow_AVX2; - } - } -#endif -#if defined(HAS_SPLITUVROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - SplitUVRow = SplitUVRow_Any_NEON; - if (IS_ALIGNED(width, 16)) { - SplitUVRow = SplitUVRow_NEON; - } - } -#endif -#if defined(HAS_SPLITUVROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - SplitUVRow = SplitUVRow_Any_MMI; - if (IS_ALIGNED(width, 8)) { - SplitUVRow = SplitUVRow_MMI; - } - } -#endif -#if defined(HAS_SPLITUVROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - SplitUVRow = SplitUVRow_Any_MSA; - if (IS_ALIGNED(width, 32)) { - SplitUVRow = SplitUVRow_MSA; - } - } -#endif -#if defined(HAS_INTERPOLATEROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - InterpolateRow = InterpolateRow_Any_SSSE3; - if (IS_ALIGNED(width, 16)) { - InterpolateRow = InterpolateRow_SSSE3; - } - } -#endif -#if defined(HAS_INTERPOLATEROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - InterpolateRow = InterpolateRow_Any_AVX2; - if (IS_ALIGNED(width, 32)) { - InterpolateRow = InterpolateRow_AVX2; - } - } -#endif -#if defined(HAS_INTERPOLATEROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - InterpolateRow = InterpolateRow_Any_NEON; - if (IS_ALIGNED(width, 16)) { - InterpolateRow = InterpolateRow_NEON; - } - } -#endif -#if defined(HAS_INTERPOLATEROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - InterpolateRow = InterpolateRow_Any_MMI; - if (IS_ALIGNED(width, 8)) { - InterpolateRow = InterpolateRow_MMI; - } - } -#endif -#if defined(HAS_INTERPOLATEROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - InterpolateRow = InterpolateRow_Any_MSA; - if (IS_ALIGNED(width, 32)) { - InterpolateRow = InterpolateRow_MSA; - } - } -#endif - - { - int awidth = halfwidth * 2; - // row of y and 2 rows of uv - align_buffer_64(rows, awidth * 3); - - for (y = 0; y < height - 1; y += 2) { - // Split Y from UV. - SplitUVRow(src_uyvy, rows + awidth, rows, awidth); - memcpy(dst_y, rows, width); - SplitUVRow(src_uyvy + src_stride_uyvy, rows + awidth * 2, rows, awidth); - memcpy(dst_y + dst_stride_y, rows, width); - InterpolateRow(dst_uv, rows + awidth, awidth, awidth, 128); - src_uyvy += src_stride_uyvy * 2; - dst_y += dst_stride_y * 2; - dst_uv += dst_stride_uv; - } - if (height & 1) { - // Split Y from UV. - SplitUVRow(src_uyvy, dst_uv, rows, awidth); - memcpy(dst_y, rows, width); - } - free_aligned_buffer_64(rows); - } - return 0; -} - -// width and height are src size allowing odd size handling. -LIBYUV_API -void HalfMergeUVPlane(const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_uv, - int dst_stride_uv, - int width, - int height) { - int y; - void (*HalfMergeUVRow)(const uint8_t* src_u, int src_stride_u, - const uint8_t* src_v, int src_stride_v, - uint8_t* dst_uv, int width) = HalfMergeUVRow_C; - - // Negative height means invert the image. 
- if (height < 0) { - height = -height; - src_u = src_u + (height - 1) * src_stride_u; - src_v = src_v + (height - 1) * src_stride_v; - src_stride_u = -src_stride_u; - src_stride_v = -src_stride_v; - } -#if defined(HAS_HALFMERGEUVROW_NEON) - if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 16)) { - HalfMergeUVRow = HalfMergeUVRow_NEON; - } -#endif -#if defined(HAS_HALFMERGEUVROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 16)) { - HalfMergeUVRow = HalfMergeUVRow_SSSE3; - } -#endif -#if defined(HAS_HALFMERGEUVROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2) && IS_ALIGNED(width, 32)) { - HalfMergeUVRow = HalfMergeUVRow_AVX2; - } -#endif - for (y = 0; y < height - 1; y += 2) { - // Merge a row of U and V into a row of UV. - HalfMergeUVRow(src_u, src_stride_u, src_v, src_stride_v, dst_uv, width); - src_u += src_stride_u * 2; - src_v += src_stride_v * 2; - dst_uv += dst_stride_uv; - } - if (height & 1) { - HalfMergeUVRow(src_u, 0, src_v, 0, dst_uv, width); - } -} - -#ifdef __cplusplus -} // extern "C" -} // namespace libyuv -#endif diff --git a/thirdparty/libyuv/source/rotate.cc b/thirdparty/libyuv/source/rotate.cc deleted file mode 100644 index 32904e4..0000000 --- a/thirdparty/libyuv/source/rotate.cc +++ /dev/null @@ -1,609 +0,0 @@ -/* - * Copyright 2011 The LibYuv Project Authors. All rights reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#include "libyuv/rotate.h" - -#include "libyuv/convert.h" -#include "libyuv/cpu_id.h" -#include "libyuv/planar_functions.h" -#include "libyuv/rotate_row.h" -#include "libyuv/row.h" - -#ifdef __cplusplus -namespace libyuv { -extern "C" { -#endif - -LIBYUV_API -void TransposePlane(const uint8_t* src, - int src_stride, - uint8_t* dst, - int dst_stride, - int width, - int height) { - int i = height; -#if defined(HAS_TRANSPOSEWX16_MSA) - void (*TransposeWx16)(const uint8_t* src, int src_stride, uint8_t* dst, - int dst_stride, int width) = TransposeWx16_C; -#else - void (*TransposeWx8)(const uint8_t* src, int src_stride, uint8_t* dst, - int dst_stride, int width) = TransposeWx8_C; -#endif - -#if defined(HAS_TRANSPOSEWX16_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - TransposeWx16 = TransposeWx16_Any_MSA; - if (IS_ALIGNED(width, 16)) { - TransposeWx16 = TransposeWx16_MSA; - } - } -#else -#if defined(HAS_TRANSPOSEWX8_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - TransposeWx8 = TransposeWx8_NEON; - } -#endif -#if defined(HAS_TRANSPOSEWX8_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - TransposeWx8 = TransposeWx8_Any_SSSE3; - if (IS_ALIGNED(width, 8)) { - TransposeWx8 = TransposeWx8_SSSE3; - } - } -#endif -#if defined(HAS_TRANSPOSEWX8_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - TransposeWx8 = TransposeWx8_MMI; - } -#endif -#if defined(HAS_TRANSPOSEWX8_FAST_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - TransposeWx8 = TransposeWx8_Fast_Any_SSSE3; - if (IS_ALIGNED(width, 16)) { - TransposeWx8 = TransposeWx8_Fast_SSSE3; - } - } -#endif -#endif /* defined(HAS_TRANSPOSEWX16_MSA) */ - -#if defined(HAS_TRANSPOSEWX16_MSA) - // Work across the source in 16x16 tiles - while (i >= 16) { - TransposeWx16(src, src_stride, dst, dst_stride, width); - src += 16 * src_stride; // Go down 16 rows. - dst += 16; // Move over 16 columns. 
- i -= 16; - } -#else - // Work across the source in 8x8 tiles - while (i >= 8) { - TransposeWx8(src, src_stride, dst, dst_stride, width); - src += 8 * src_stride; // Go down 8 rows. - dst += 8; // Move over 8 columns. - i -= 8; - } -#endif - - if (i > 0) { - TransposeWxH_C(src, src_stride, dst, dst_stride, width, i); - } -} - -LIBYUV_API -void RotatePlane90(const uint8_t* src, - int src_stride, - uint8_t* dst, - int dst_stride, - int width, - int height) { - // Rotate by 90 is a transpose with the source read - // from bottom to top. So set the source pointer to the end - // of the buffer and flip the sign of the source stride. - src += src_stride * (height - 1); - src_stride = -src_stride; - TransposePlane(src, src_stride, dst, dst_stride, width, height); -} - -LIBYUV_API -void RotatePlane270(const uint8_t* src, - int src_stride, - uint8_t* dst, - int dst_stride, - int width, - int height) { - // Rotate by 270 is a transpose with the destination written - // from bottom to top. So set the destination pointer to the end - // of the buffer and flip the sign of the destination stride. - dst += dst_stride * (width - 1); - dst_stride = -dst_stride; - TransposePlane(src, src_stride, dst, dst_stride, width, height); -} - -LIBYUV_API -void RotatePlane180(const uint8_t* src, - int src_stride, - uint8_t* dst, - int dst_stride, - int width, - int height) { - // Swap first and last row and mirror the content. Uses a temporary row. - align_buffer_64(row, width); - const uint8_t* src_bot = src + src_stride * (height - 1); - uint8_t* dst_bot = dst + dst_stride * (height - 1); - int half_height = (height + 1) >> 1; - int y; - void (*MirrorRow)(const uint8_t* src, uint8_t* dst, int width) = MirrorRow_C; - void (*CopyRow)(const uint8_t* src, uint8_t* dst, int width) = CopyRow_C; -#if defined(HAS_MIRRORROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - MirrorRow = MirrorRow_Any_NEON; - if (IS_ALIGNED(width, 32)) { - MirrorRow = MirrorRow_NEON; - } - } -#endif -#if defined(HAS_MIRRORROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - MirrorRow = MirrorRow_Any_SSSE3; - if (IS_ALIGNED(width, 16)) { - MirrorRow = MirrorRow_SSSE3; - } - } -#endif -#if defined(HAS_MIRRORROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - MirrorRow = MirrorRow_Any_AVX2; - if (IS_ALIGNED(width, 32)) { - MirrorRow = MirrorRow_AVX2; - } - } -#endif -#if defined(HAS_MIRRORROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - MirrorRow = MirrorRow_Any_MMI; - if (IS_ALIGNED(width, 8)) { - MirrorRow = MirrorRow_MMI; - } - } -#endif -#if defined(HAS_MIRRORROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - MirrorRow = MirrorRow_Any_MSA; - if (IS_ALIGNED(width, 64)) { - MirrorRow = MirrorRow_MSA; - } - } -#endif -#if defined(HAS_COPYROW_SSE2) - if (TestCpuFlag(kCpuHasSSE2)) { - CopyRow = IS_ALIGNED(width, 32) ? CopyRow_SSE2 : CopyRow_Any_SSE2; - } -#endif -#if defined(HAS_COPYROW_AVX) - if (TestCpuFlag(kCpuHasAVX)) { - CopyRow = IS_ALIGNED(width, 64) ? CopyRow_AVX : CopyRow_Any_AVX; - } -#endif -#if defined(HAS_COPYROW_ERMS) - if (TestCpuFlag(kCpuHasERMS)) { - CopyRow = CopyRow_ERMS; - } -#endif -#if defined(HAS_COPYROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - CopyRow = IS_ALIGNED(width, 32) ? CopyRow_NEON : CopyRow_Any_NEON; - } -#endif -#if defined(HAS_COPYROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - CopyRow = IS_ALIGNED(width, 8) ? CopyRow_MMI : CopyRow_Any_MMI; - } -#endif - - // Odd height will harmlessly mirror the middle row twice. 
- for (y = 0; y < half_height; ++y) { - CopyRow(src, row, width); // Copy first row into buffer - MirrorRow(src_bot, dst, width); // Mirror last row into first row - MirrorRow(row, dst_bot, width); // Mirror buffer into last row - src += src_stride; - dst += dst_stride; - src_bot -= src_stride; - dst_bot -= dst_stride; - } - free_aligned_buffer_64(row); -} - -LIBYUV_API -void TransposeUV(const uint8_t* src, - int src_stride, - uint8_t* dst_a, - int dst_stride_a, - uint8_t* dst_b, - int dst_stride_b, - int width, - int height) { - int i = height; -#if defined(HAS_TRANSPOSEUVWX16_MSA) - void (*TransposeUVWx16)(const uint8_t* src, int src_stride, uint8_t* dst_a, - int dst_stride_a, uint8_t* dst_b, int dst_stride_b, - int width) = TransposeUVWx16_C; -#else - void (*TransposeUVWx8)(const uint8_t* src, int src_stride, uint8_t* dst_a, - int dst_stride_a, uint8_t* dst_b, int dst_stride_b, - int width) = TransposeUVWx8_C; -#endif - -#if defined(HAS_TRANSPOSEUVWX16_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - TransposeUVWx16 = TransposeUVWx16_Any_MSA; - if (IS_ALIGNED(width, 8)) { - TransposeUVWx16 = TransposeUVWx16_MSA; - } - } -#else -#if defined(HAS_TRANSPOSEUVWX8_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - TransposeUVWx8 = TransposeUVWx8_NEON; - } -#endif -#if defined(HAS_TRANSPOSEUVWX8_SSE2) - if (TestCpuFlag(kCpuHasSSE2)) { - TransposeUVWx8 = TransposeUVWx8_Any_SSE2; - if (IS_ALIGNED(width, 8)) { - TransposeUVWx8 = TransposeUVWx8_SSE2; - } - } -#endif -#if defined(HAS_TRANSPOSEUVWX8_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - TransposeUVWx8 = TransposeUVWx8_Any_MMI; - if (IS_ALIGNED(width, 4)) { - TransposeUVWx8 = TransposeUVWx8_MMI; - } - } -#endif -#endif /* defined(HAS_TRANSPOSEUVWX16_MSA) */ - -#if defined(HAS_TRANSPOSEUVWX16_MSA) - // Work through the source in 8x8 tiles. - while (i >= 16) { - TransposeUVWx16(src, src_stride, dst_a, dst_stride_a, dst_b, dst_stride_b, - width); - src += 16 * src_stride; // Go down 16 rows. - dst_a += 16; // Move over 8 columns. - dst_b += 16; // Move over 8 columns. - i -= 16; - } -#else - // Work through the source in 8x8 tiles. - while (i >= 8) { - TransposeUVWx8(src, src_stride, dst_a, dst_stride_a, dst_b, dst_stride_b, - width); - src += 8 * src_stride; // Go down 8 rows. - dst_a += 8; // Move over 8 columns. - dst_b += 8; // Move over 8 columns. - i -= 8; - } -#endif - - if (i > 0) { - TransposeUVWxH_C(src, src_stride, dst_a, dst_stride_a, dst_b, dst_stride_b, - width, i); - } -} - -LIBYUV_API -void RotateUV90(const uint8_t* src, - int src_stride, - uint8_t* dst_a, - int dst_stride_a, - uint8_t* dst_b, - int dst_stride_b, - int width, - int height) { - src += src_stride * (height - 1); - src_stride = -src_stride; - - TransposeUV(src, src_stride, dst_a, dst_stride_a, dst_b, dst_stride_b, width, - height); -} - -LIBYUV_API -void RotateUV270(const uint8_t* src, - int src_stride, - uint8_t* dst_a, - int dst_stride_a, - uint8_t* dst_b, - int dst_stride_b, - int width, - int height) { - dst_a += dst_stride_a * (width - 1); - dst_b += dst_stride_b * (width - 1); - dst_stride_a = -dst_stride_a; - dst_stride_b = -dst_stride_b; - - TransposeUV(src, src_stride, dst_a, dst_stride_a, dst_b, dst_stride_b, width, - height); -} - -// Rotate 180 is a horizontal and vertical flip. 
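Before the 180-degree UV path below, one practical point about the 90/270 rotations in this file: because they are implemented as transposes, the destination plane has swapped dimensions, so a width x height source needs a height x width destination (dst_stride of at least height). A hedged sketch using the RotatePlane dispatcher defined later in this file:

    // Hypothetical 90-degree rotation of a single grayscale plane.
    RotatePlane(src, src_stride, dst, height /* dst_stride */, width, height, kRotate90);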
-LIBYUV_API -void RotateUV180(const uint8_t* src, - int src_stride, - uint8_t* dst_a, - int dst_stride_a, - uint8_t* dst_b, - int dst_stride_b, - int width, - int height) { - int i; - void (*MirrorSplitUVRow)(const uint8_t* src, uint8_t* dst_u, uint8_t* dst_v, - int width) = MirrorSplitUVRow_C; -#if defined(HAS_MIRRORSPLITUVROW_NEON) - if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 16)) { - MirrorSplitUVRow = MirrorSplitUVRow_NEON; - } -#endif -#if defined(HAS_MIRRORSPLITUVROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 16)) { - MirrorSplitUVRow = MirrorSplitUVRow_SSSE3; - } -#endif -#if defined(HAS_MIRRORSPLITUVROW_MMI) - if (TestCpuFlag(kCpuHasMMI) && IS_ALIGNED(width, 8)) { - MirrorSplitUVRow = MirrorSplitUVRow_MMI; - } -#endif -#if defined(HAS_MIRRORSPLITUVROW_MSA) - if (TestCpuFlag(kCpuHasMSA) && IS_ALIGNED(width, 32)) { - MirrorSplitUVRow = MirrorSplitUVRow_MSA; - } -#endif - - dst_a += dst_stride_a * (height - 1); - dst_b += dst_stride_b * (height - 1); - - for (i = 0; i < height; ++i) { - MirrorSplitUVRow(src, dst_a, dst_b, width); - src += src_stride; - dst_a -= dst_stride_a; - dst_b -= dst_stride_b; - } -} - -LIBYUV_API -int RotatePlane(const uint8_t* src, - int src_stride, - uint8_t* dst, - int dst_stride, - int width, - int height, - enum RotationMode mode) { - if (!src || width <= 0 || height == 0 || !dst) { - return -1; - } - - // Negative height means invert the image. - if (height < 0) { - height = -height; - src = src + (height - 1) * src_stride; - src_stride = -src_stride; - } - - switch (mode) { - case kRotate0: - // copy frame - CopyPlane(src, src_stride, dst, dst_stride, width, height); - return 0; - case kRotate90: - RotatePlane90(src, src_stride, dst, dst_stride, width, height); - return 0; - case kRotate270: - RotatePlane270(src, src_stride, dst, dst_stride, width, height); - return 0; - case kRotate180: - RotatePlane180(src, src_stride, dst, dst_stride, width, height); - return 0; - default: - break; - } - return -1; -} - -LIBYUV_API -int I420Rotate(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_y, - int dst_stride_y, - uint8_t* dst_u, - int dst_stride_u, - uint8_t* dst_v, - int dst_stride_v, - int width, - int height, - enum RotationMode mode) { - int halfwidth = (width + 1) >> 1; - int halfheight = (height + 1) >> 1; - if (!src_y || !src_u || !src_v || width <= 0 || height == 0 || !dst_y || - !dst_u || !dst_v) { - return -1; - } - - // Negative height means invert the image. 
- if (height < 0) { - height = -height; - halfheight = (height + 1) >> 1; - src_y = src_y + (height - 1) * src_stride_y; - src_u = src_u + (halfheight - 1) * src_stride_u; - src_v = src_v + (halfheight - 1) * src_stride_v; - src_stride_y = -src_stride_y; - src_stride_u = -src_stride_u; - src_stride_v = -src_stride_v; - } - - switch (mode) { - case kRotate0: - // copy frame - return I420Copy(src_y, src_stride_y, src_u, src_stride_u, src_v, - src_stride_v, dst_y, dst_stride_y, dst_u, dst_stride_u, - dst_v, dst_stride_v, width, height); - case kRotate90: - RotatePlane90(src_y, src_stride_y, dst_y, dst_stride_y, width, height); - RotatePlane90(src_u, src_stride_u, dst_u, dst_stride_u, halfwidth, - halfheight); - RotatePlane90(src_v, src_stride_v, dst_v, dst_stride_v, halfwidth, - halfheight); - return 0; - case kRotate270: - RotatePlane270(src_y, src_stride_y, dst_y, dst_stride_y, width, height); - RotatePlane270(src_u, src_stride_u, dst_u, dst_stride_u, halfwidth, - halfheight); - RotatePlane270(src_v, src_stride_v, dst_v, dst_stride_v, halfwidth, - halfheight); - return 0; - case kRotate180: - RotatePlane180(src_y, src_stride_y, dst_y, dst_stride_y, width, height); - RotatePlane180(src_u, src_stride_u, dst_u, dst_stride_u, halfwidth, - halfheight); - RotatePlane180(src_v, src_stride_v, dst_v, dst_stride_v, halfwidth, - halfheight); - return 0; - default: - break; - } - return -1; -} - -LIBYUV_API -int I444Rotate(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_y, - int dst_stride_y, - uint8_t* dst_u, - int dst_stride_u, - uint8_t* dst_v, - int dst_stride_v, - int width, - int height, - enum libyuv::RotationMode mode) { - if (!src_y || !src_u || !src_v || width <= 0 || height == 0 || !dst_y || - !dst_u || !dst_v) { - return -1; - } - - // Negative height means invert the image. 
- if (height < 0) { - height = -height; - src_y = src_y + (height - 1) * src_stride_y; - src_u = src_u + (height - 1) * src_stride_u; - src_v = src_v + (height - 1) * src_stride_v; - src_stride_y = -src_stride_y; - src_stride_u = -src_stride_u; - src_stride_v = -src_stride_v; - } - - switch (mode) { - case libyuv::kRotate0: - // copy frame - CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height); - CopyPlane(src_u, src_stride_u, dst_u, dst_stride_u, width, height); - CopyPlane(src_v, src_stride_v, dst_v, dst_stride_v, width, height); - return 0; - case libyuv::kRotate90: - RotatePlane90(src_y, src_stride_y, dst_y, dst_stride_y, width, height); - RotatePlane90(src_u, src_stride_u, dst_u, dst_stride_u, width, height); - RotatePlane90(src_v, src_stride_v, dst_v, dst_stride_v, width, height); - return 0; - case libyuv::kRotate270: - RotatePlane270(src_y, src_stride_y, dst_y, dst_stride_y, width, height); - RotatePlane270(src_u, src_stride_u, dst_u, dst_stride_u, width, height); - RotatePlane270(src_v, src_stride_v, dst_v, dst_stride_v, width, height); - return 0; - case libyuv::kRotate180: - RotatePlane180(src_y, src_stride_y, dst_y, dst_stride_y, width, height); - RotatePlane180(src_u, src_stride_u, dst_u, dst_stride_u, width, height); - RotatePlane180(src_v, src_stride_v, dst_v, dst_stride_v, width, height); - return 0; - default: - break; - } - return -1; -} - -LIBYUV_API -int NV12ToI420Rotate(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_uv, - int src_stride_uv, - uint8_t* dst_y, - int dst_stride_y, - uint8_t* dst_u, - int dst_stride_u, - uint8_t* dst_v, - int dst_stride_v, - int width, - int height, - enum RotationMode mode) { - int halfwidth = (width + 1) >> 1; - int halfheight = (height + 1) >> 1; - if (!src_y || !src_uv || width <= 0 || height == 0 || !dst_y || !dst_u || - !dst_v) { - return -1; - } - - // Negative height means invert the image. - if (height < 0) { - height = -height; - halfheight = (height + 1) >> 1; - src_y = src_y + (height - 1) * src_stride_y; - src_uv = src_uv + (halfheight - 1) * src_stride_uv; - src_stride_y = -src_stride_y; - src_stride_uv = -src_stride_uv; - } - - switch (mode) { - case kRotate0: - // copy frame - return NV12ToI420(src_y, src_stride_y, src_uv, src_stride_uv, dst_y, - dst_stride_y, dst_u, dst_stride_u, dst_v, dst_stride_v, - width, height); - case kRotate90: - RotatePlane90(src_y, src_stride_y, dst_y, dst_stride_y, width, height); - RotateUV90(src_uv, src_stride_uv, dst_u, dst_stride_u, dst_v, - dst_stride_v, halfwidth, halfheight); - return 0; - case kRotate270: - RotatePlane270(src_y, src_stride_y, dst_y, dst_stride_y, width, height); - RotateUV270(src_uv, src_stride_uv, dst_u, dst_stride_u, dst_v, - dst_stride_v, halfwidth, halfheight); - return 0; - case kRotate180: - RotatePlane180(src_y, src_stride_y, dst_y, dst_stride_y, width, height); - RotateUV180(src_uv, src_stride_uv, dst_u, dst_stride_u, dst_v, - dst_stride_v, halfwidth, halfheight); - return 0; - default: - break; - } - return -1; -} - -#ifdef __cplusplus -} // extern "C" -} // namespace libyuv -#endif diff --git a/thirdparty/libyuv/source/rotate_any.cc b/thirdparty/libyuv/source/rotate_any.cc deleted file mode 100644 index b3baf08..0000000 --- a/thirdparty/libyuv/source/rotate_any.cc +++ /dev/null @@ -1,79 +0,0 @@ -/* - * Copyright 2015 The LibYuv Project Authors. All rights reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. 
An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#include "libyuv/rotate.h" -#include "libyuv/rotate_row.h" - -#include "libyuv/basic_types.h" - -#ifdef __cplusplus -namespace libyuv { -extern "C" { -#endif - -#define TANY(NAMEANY, TPOS_SIMD, MASK) \ - void NAMEANY(const uint8_t* src, int src_stride, uint8_t* dst, \ - int dst_stride, int width) { \ - int r = width & MASK; \ - int n = width - r; \ - if (n > 0) { \ - TPOS_SIMD(src, src_stride, dst, dst_stride, n); \ - } \ - TransposeWx8_C(src + n, src_stride, dst + n * dst_stride, dst_stride, r); \ - } - -#ifdef HAS_TRANSPOSEWX8_NEON -TANY(TransposeWx8_Any_NEON, TransposeWx8_NEON, 7) -#endif -#ifdef HAS_TRANSPOSEWX8_SSSE3 -TANY(TransposeWx8_Any_SSSE3, TransposeWx8_SSSE3, 7) -#endif -#ifdef HAS_TRANSPOSEWX8_MMI -TANY(TransposeWx8_Any_MMI, TransposeWx8_MMI, 7) -#endif -#ifdef HAS_TRANSPOSEWX8_FAST_SSSE3 -TANY(TransposeWx8_Fast_Any_SSSE3, TransposeWx8_Fast_SSSE3, 15) -#endif -#ifdef HAS_TRANSPOSEWX16_MSA -TANY(TransposeWx16_Any_MSA, TransposeWx16_MSA, 15) -#endif -#undef TANY - -#define TUVANY(NAMEANY, TPOS_SIMD, MASK) \ - void NAMEANY(const uint8_t* src, int src_stride, uint8_t* dst_a, \ - int dst_stride_a, uint8_t* dst_b, int dst_stride_b, \ - int width) { \ - int r = width & MASK; \ - int n = width - r; \ - if (n > 0) { \ - TPOS_SIMD(src, src_stride, dst_a, dst_stride_a, dst_b, dst_stride_b, n); \ - } \ - TransposeUVWx8_C(src + n * 2, src_stride, dst_a + n * dst_stride_a, \ - dst_stride_a, dst_b + n * dst_stride_b, dst_stride_b, r); \ - } - -#ifdef HAS_TRANSPOSEUVWX8_NEON -TUVANY(TransposeUVWx8_Any_NEON, TransposeUVWx8_NEON, 7) -#endif -#ifdef HAS_TRANSPOSEUVWX8_SSE2 -TUVANY(TransposeUVWx8_Any_SSE2, TransposeUVWx8_SSE2, 7) -#endif -#ifdef HAS_TRANSPOSEUVWX8_MMI -TUVANY(TransposeUVWx8_Any_MMI, TransposeUVWx8_MMI, 7) -#endif -#ifdef HAS_TRANSPOSEUVWX16_MSA -TUVANY(TransposeUVWx16_Any_MSA, TransposeUVWx16_MSA, 7) -#endif -#undef TUVANY - -#ifdef __cplusplus -} // extern "C" -} // namespace libyuv -#endif diff --git a/thirdparty/libyuv/source/rotate_argb.cc b/thirdparty/libyuv/source/rotate_argb.cc deleted file mode 100644 index ae65388..0000000 --- a/thirdparty/libyuv/source/rotate_argb.cc +++ /dev/null @@ -1,243 +0,0 @@ -/* - * Copyright 2012 The LibYuv Project Authors. All rights reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#include "libyuv/rotate.h" - -#include "libyuv/convert.h" -#include "libyuv/cpu_id.h" -#include "libyuv/planar_functions.h" -#include "libyuv/row.h" -#include "libyuv/scale_row.h" /* for ScaleARGBRowDownEven_ */ - -#ifdef __cplusplus -namespace libyuv { -extern "C" { -#endif - -static int ARGBTranspose(const uint8_t* src_argb, - int src_stride_argb, - uint8_t* dst_argb, - int dst_stride_argb, - int width, - int height) { - int i; - int src_pixel_step = src_stride_argb >> 2; - void (*ScaleARGBRowDownEven)( - const uint8_t* src_argb, ptrdiff_t src_stride_argb, int src_step, - uint8_t* dst_argb, int dst_width) = ScaleARGBRowDownEven_C; - // Check stride is a multiple of 4. 
- if (src_stride_argb & 3) { - return -1; - } -#if defined(HAS_SCALEARGBROWDOWNEVEN_SSE2) - if (TestCpuFlag(kCpuHasSSE2)) { - ScaleARGBRowDownEven = ScaleARGBRowDownEven_Any_SSE2; - if (IS_ALIGNED(height, 4)) { // Width of dest. - ScaleARGBRowDownEven = ScaleARGBRowDownEven_SSE2; - } - } -#endif -#if defined(HAS_SCALEARGBROWDOWNEVEN_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - ScaleARGBRowDownEven = ScaleARGBRowDownEven_Any_NEON; - if (IS_ALIGNED(height, 4)) { // Width of dest. - ScaleARGBRowDownEven = ScaleARGBRowDownEven_NEON; - } - } -#endif -#if defined(HAS_SCALEARGBROWDOWNEVEN_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - ScaleARGBRowDownEven = ScaleARGBRowDownEven_Any_MMI; - if (IS_ALIGNED(height, 4)) { // Width of dest. - ScaleARGBRowDownEven = ScaleARGBRowDownEven_MMI; - } - } -#endif -#if defined(HAS_SCALEARGBROWDOWNEVEN_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - ScaleARGBRowDownEven = ScaleARGBRowDownEven_Any_MSA; - if (IS_ALIGNED(height, 4)) { // Width of dest. - ScaleARGBRowDownEven = ScaleARGBRowDownEven_MSA; - } - } -#endif - - for (i = 0; i < width; ++i) { // column of source to row of dest. - ScaleARGBRowDownEven(src_argb, 0, src_pixel_step, dst_argb, height); - dst_argb += dst_stride_argb; - src_argb += 4; - } - return 0; -} - -static int ARGBRotate90(const uint8_t* src_argb, - int src_stride_argb, - uint8_t* dst_argb, - int dst_stride_argb, - int width, - int height) { - // Rotate by 90 is a ARGBTranspose with the source read - // from bottom to top. So set the source pointer to the end - // of the buffer and flip the sign of the source stride. - src_argb += src_stride_argb * (height - 1); - src_stride_argb = -src_stride_argb; - return ARGBTranspose(src_argb, src_stride_argb, dst_argb, dst_stride_argb, - width, height); -} - -static int ARGBRotate270(const uint8_t* src_argb, - int src_stride_argb, - uint8_t* dst_argb, - int dst_stride_argb, - int width, - int height) { - // Rotate by 270 is a ARGBTranspose with the destination written - // from bottom to top. So set the destination pointer to the end - // of the buffer and flip the sign of the destination stride. - dst_argb += dst_stride_argb * (width - 1); - dst_stride_argb = -dst_stride_argb; - return ARGBTranspose(src_argb, src_stride_argb, dst_argb, dst_stride_argb, - width, height); -} - -static int ARGBRotate180(const uint8_t* src_argb, - int src_stride_argb, - uint8_t* dst_argb, - int dst_stride_argb, - int width, - int height) { - // Swap first and last row and mirror the content. Uses a temporary row. 
- align_buffer_64(row, width * 4); - const uint8_t* src_bot = src_argb + src_stride_argb * (height - 1); - uint8_t* dst_bot = dst_argb + dst_stride_argb * (height - 1); - int half_height = (height + 1) >> 1; - int y; - void (*ARGBMirrorRow)(const uint8_t* src_argb, uint8_t* dst_argb, int width) = - ARGBMirrorRow_C; - void (*CopyRow)(const uint8_t* src_argb, uint8_t* dst_argb, int width) = - CopyRow_C; -#if defined(HAS_ARGBMIRRORROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - ARGBMirrorRow = ARGBMirrorRow_Any_NEON; - if (IS_ALIGNED(width, 8)) { - ARGBMirrorRow = ARGBMirrorRow_NEON; - } - } -#endif -#if defined(HAS_ARGBMIRRORROW_SSE2) - if (TestCpuFlag(kCpuHasSSE2)) { - ARGBMirrorRow = ARGBMirrorRow_Any_SSE2; - if (IS_ALIGNED(width, 4)) { - ARGBMirrorRow = ARGBMirrorRow_SSE2; - } - } -#endif -#if defined(HAS_ARGBMIRRORROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - ARGBMirrorRow = ARGBMirrorRow_Any_AVX2; - if (IS_ALIGNED(width, 8)) { - ARGBMirrorRow = ARGBMirrorRow_AVX2; - } - } -#endif -#if defined(HAS_ARGBMIRRORROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - ARGBMirrorRow = ARGBMirrorRow_Any_MMI; - if (IS_ALIGNED(width, 2)) { - ARGBMirrorRow = ARGBMirrorRow_MMI; - } - } -#endif -#if defined(HAS_ARGBMIRRORROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - ARGBMirrorRow = ARGBMirrorRow_Any_MSA; - if (IS_ALIGNED(width, 16)) { - ARGBMirrorRow = ARGBMirrorRow_MSA; - } - } -#endif -#if defined(HAS_COPYROW_SSE2) - if (TestCpuFlag(kCpuHasSSE2)) { - CopyRow = IS_ALIGNED(width * 4, 32) ? CopyRow_SSE2 : CopyRow_Any_SSE2; - } -#endif -#if defined(HAS_COPYROW_AVX) - if (TestCpuFlag(kCpuHasAVX)) { - CopyRow = IS_ALIGNED(width * 4, 64) ? CopyRow_AVX : CopyRow_Any_AVX; - } -#endif -#if defined(HAS_COPYROW_ERMS) - if (TestCpuFlag(kCpuHasERMS)) { - CopyRow = CopyRow_ERMS; - } -#endif -#if defined(HAS_COPYROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - CopyRow = IS_ALIGNED(width * 4, 32) ? CopyRow_NEON : CopyRow_Any_NEON; - } -#endif - - // Odd height will harmlessly mirror the middle row twice. - for (y = 0; y < half_height; ++y) { - ARGBMirrorRow(src_argb, row, width); // Mirror first row into a buffer - ARGBMirrorRow(src_bot, dst_argb, width); // Mirror last row into first row - CopyRow(row, dst_bot, width * 4); // Copy first mirrored row into last - src_argb += src_stride_argb; - dst_argb += dst_stride_argb; - src_bot -= src_stride_argb; - dst_bot -= dst_stride_argb; - } - free_aligned_buffer_64(row); - return 0; -} - -LIBYUV_API -int ARGBRotate(const uint8_t* src_argb, - int src_stride_argb, - uint8_t* dst_argb, - int dst_stride_argb, - int width, - int height, - enum RotationMode mode) { - if (!src_argb || width <= 0 || height == 0 || !dst_argb) { - return -1; - } - - // Negative height means invert the image. 
- if (height < 0) { - height = -height; - src_argb = src_argb + (height - 1) * src_stride_argb; - src_stride_argb = -src_stride_argb; - } - - switch (mode) { - case kRotate0: - // copy frame - return ARGBCopy(src_argb, src_stride_argb, dst_argb, dst_stride_argb, - width, height); - case kRotate90: - return ARGBRotate90(src_argb, src_stride_argb, dst_argb, dst_stride_argb, - width, height); - case kRotate270: - return ARGBRotate270(src_argb, src_stride_argb, dst_argb, dst_stride_argb, - width, height); - case kRotate180: - return ARGBRotate180(src_argb, src_stride_argb, dst_argb, dst_stride_argb, - width, height); - default: - break; - } - return -1; -} - -#ifdef __cplusplus -} // extern "C" -} // namespace libyuv -#endif diff --git a/thirdparty/libyuv/source/rotate_common.cc b/thirdparty/libyuv/source/rotate_common.cc deleted file mode 100644 index ff212ad..0000000 --- a/thirdparty/libyuv/source/rotate_common.cc +++ /dev/null @@ -1,106 +0,0 @@ -/* - * Copyright 2011 The LibYuv Project Authors. All rights reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#include "libyuv/rotate_row.h" -#include "libyuv/row.h" - -#ifdef __cplusplus -namespace libyuv { -extern "C" { -#endif - -void TransposeWx8_C(const uint8_t* src, - int src_stride, - uint8_t* dst, - int dst_stride, - int width) { - int i; - for (i = 0; i < width; ++i) { - dst[0] = src[0 * src_stride]; - dst[1] = src[1 * src_stride]; - dst[2] = src[2 * src_stride]; - dst[3] = src[3 * src_stride]; - dst[4] = src[4 * src_stride]; - dst[5] = src[5 * src_stride]; - dst[6] = src[6 * src_stride]; - dst[7] = src[7 * src_stride]; - ++src; - dst += dst_stride; - } -} - -void TransposeUVWx8_C(const uint8_t* src, - int src_stride, - uint8_t* dst_a, - int dst_stride_a, - uint8_t* dst_b, - int dst_stride_b, - int width) { - int i; - for (i = 0; i < width; ++i) { - dst_a[0] = src[0 * src_stride + 0]; - dst_b[0] = src[0 * src_stride + 1]; - dst_a[1] = src[1 * src_stride + 0]; - dst_b[1] = src[1 * src_stride + 1]; - dst_a[2] = src[2 * src_stride + 0]; - dst_b[2] = src[2 * src_stride + 1]; - dst_a[3] = src[3 * src_stride + 0]; - dst_b[3] = src[3 * src_stride + 1]; - dst_a[4] = src[4 * src_stride + 0]; - dst_b[4] = src[4 * src_stride + 1]; - dst_a[5] = src[5 * src_stride + 0]; - dst_b[5] = src[5 * src_stride + 1]; - dst_a[6] = src[6 * src_stride + 0]; - dst_b[6] = src[6 * src_stride + 1]; - dst_a[7] = src[7 * src_stride + 0]; - dst_b[7] = src[7 * src_stride + 1]; - src += 2; - dst_a += dst_stride_a; - dst_b += dst_stride_b; - } -} - -void TransposeWxH_C(const uint8_t* src, - int src_stride, - uint8_t* dst, - int dst_stride, - int width, - int height) { - int i; - for (i = 0; i < width; ++i) { - int j; - for (j = 0; j < height; ++j) { - dst[i * dst_stride + j] = src[j * src_stride + i]; - } - } -} - -void TransposeUVWxH_C(const uint8_t* src, - int src_stride, - uint8_t* dst_a, - int dst_stride_a, - uint8_t* dst_b, - int dst_stride_b, - int width, - int height) { - int i; - for (i = 0; i < width * 2; i += 2) { - int j; - for (j = 0; j < height; ++j) { - dst_a[j + ((i >> 1) * dst_stride_a)] = src[i + (j * src_stride)]; - dst_b[j + ((i >> 1) * dst_stride_b)] = src[i + (j * src_stride) + 1]; - } - } -} - -#ifdef __cplusplus -} // extern "C" -} // namespace libyuv 
-#endif diff --git a/thirdparty/libyuv/source/rotate_gcc.cc b/thirdparty/libyuv/source/rotate_gcc.cc deleted file mode 100644 index 8401d4f..0000000 --- a/thirdparty/libyuv/source/rotate_gcc.cc +++ /dev/null @@ -1,374 +0,0 @@ -/* - * Copyright 2015 The LibYuv Project Authors. All rights reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#include "libyuv/rotate_row.h" -#include "libyuv/row.h" - -#ifdef __cplusplus -namespace libyuv { -extern "C" { -#endif - -// This module is for GCC x86 and x64. -#if !defined(LIBYUV_DISABLE_X86) && \ - (defined(__x86_64__) || defined(__i386__)) - -// Transpose 8x8. 32 or 64 bit, but not NaCL for 64 bit. -#if defined(HAS_TRANSPOSEWX8_SSSE3) -void TransposeWx8_SSSE3(const uint8_t* src, - int src_stride, - uint8_t* dst, - int dst_stride, - int width) { - asm volatile( - // Read in the data from the source pointer. - // First round of bit swap. - LABELALIGN - "1: \n" - "movq (%0),%%xmm0 \n" - "movq (%0,%3),%%xmm1 \n" - "lea (%0,%3,2),%0 \n" - "punpcklbw %%xmm1,%%xmm0 \n" - "movq (%0),%%xmm2 \n" - "movdqa %%xmm0,%%xmm1 \n" - "palignr $0x8,%%xmm1,%%xmm1 \n" - "movq (%0,%3),%%xmm3 \n" - "lea (%0,%3,2),%0 \n" - "punpcklbw %%xmm3,%%xmm2 \n" - "movdqa %%xmm2,%%xmm3 \n" - "movq (%0),%%xmm4 \n" - "palignr $0x8,%%xmm3,%%xmm3 \n" - "movq (%0,%3),%%xmm5 \n" - "lea (%0,%3,2),%0 \n" - "punpcklbw %%xmm5,%%xmm4 \n" - "movdqa %%xmm4,%%xmm5 \n" - "movq (%0),%%xmm6 \n" - "palignr $0x8,%%xmm5,%%xmm5 \n" - "movq (%0,%3),%%xmm7 \n" - "lea (%0,%3,2),%0 \n" - "punpcklbw %%xmm7,%%xmm6 \n" - "neg %3 \n" - "movdqa %%xmm6,%%xmm7 \n" - "lea 0x8(%0,%3,8),%0 \n" - "palignr $0x8,%%xmm7,%%xmm7 \n" - "neg %3 \n" - // Second round of bit swap. - "punpcklwd %%xmm2,%%xmm0 \n" - "punpcklwd %%xmm3,%%xmm1 \n" - "movdqa %%xmm0,%%xmm2 \n" - "movdqa %%xmm1,%%xmm3 \n" - "palignr $0x8,%%xmm2,%%xmm2 \n" - "palignr $0x8,%%xmm3,%%xmm3 \n" - "punpcklwd %%xmm6,%%xmm4 \n" - "punpcklwd %%xmm7,%%xmm5 \n" - "movdqa %%xmm4,%%xmm6 \n" - "movdqa %%xmm5,%%xmm7 \n" - "palignr $0x8,%%xmm6,%%xmm6 \n" - "palignr $0x8,%%xmm7,%%xmm7 \n" - // Third round of bit swap. - // Write to the destination pointer. - "punpckldq %%xmm4,%%xmm0 \n" - "movq %%xmm0,(%1) \n" - "movdqa %%xmm0,%%xmm4 \n" - "palignr $0x8,%%xmm4,%%xmm4 \n" - "movq %%xmm4,(%1,%4) \n" - "lea (%1,%4,2),%1 \n" - "punpckldq %%xmm6,%%xmm2 \n" - "movdqa %%xmm2,%%xmm6 \n" - "movq %%xmm2,(%1) \n" - "palignr $0x8,%%xmm6,%%xmm6 \n" - "punpckldq %%xmm5,%%xmm1 \n" - "movq %%xmm6,(%1,%4) \n" - "lea (%1,%4,2),%1 \n" - "movdqa %%xmm1,%%xmm5 \n" - "movq %%xmm1,(%1) \n" - "palignr $0x8,%%xmm5,%%xmm5 \n" - "movq %%xmm5,(%1,%4) \n" - "lea (%1,%4,2),%1 \n" - "punpckldq %%xmm7,%%xmm3 \n" - "movq %%xmm3,(%1) \n" - "movdqa %%xmm3,%%xmm7 \n" - "palignr $0x8,%%xmm7,%%xmm7 \n" - "sub $0x8,%2 \n" - "movq %%xmm7,(%1,%4) \n" - "lea (%1,%4,2),%1 \n" - "jg 1b \n" - : "+r"(src), // %0 - "+r"(dst), // %1 - "+r"(width) // %2 - : "r"((intptr_t)(src_stride)), // %3 - "r"((intptr_t)(dst_stride)) // %4 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", - "xmm7"); -} -#endif // defined(HAS_TRANSPOSEWX8_SSSE3) - -// Transpose 16x8. 
64 bit -#if defined(HAS_TRANSPOSEWX8_FAST_SSSE3) -void TransposeWx8_Fast_SSSE3(const uint8_t* src, - int src_stride, - uint8_t* dst, - int dst_stride, - int width) { - asm volatile( - // Read in the data from the source pointer. - // First round of bit swap. - LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu (%0,%3),%%xmm1 \n" - "lea (%0,%3,2),%0 \n" - "movdqa %%xmm0,%%xmm8 \n" - "punpcklbw %%xmm1,%%xmm0 \n" - "punpckhbw %%xmm1,%%xmm8 \n" - "movdqu (%0),%%xmm2 \n" - "movdqa %%xmm0,%%xmm1 \n" - "movdqa %%xmm8,%%xmm9 \n" - "palignr $0x8,%%xmm1,%%xmm1 \n" - "palignr $0x8,%%xmm9,%%xmm9 \n" - "movdqu (%0,%3),%%xmm3 \n" - "lea (%0,%3,2),%0 \n" - "movdqa %%xmm2,%%xmm10 \n" - "punpcklbw %%xmm3,%%xmm2 \n" - "punpckhbw %%xmm3,%%xmm10 \n" - "movdqa %%xmm2,%%xmm3 \n" - "movdqa %%xmm10,%%xmm11 \n" - "movdqu (%0),%%xmm4 \n" - "palignr $0x8,%%xmm3,%%xmm3 \n" - "palignr $0x8,%%xmm11,%%xmm11 \n" - "movdqu (%0,%3),%%xmm5 \n" - "lea (%0,%3,2),%0 \n" - "movdqa %%xmm4,%%xmm12 \n" - "punpcklbw %%xmm5,%%xmm4 \n" - "punpckhbw %%xmm5,%%xmm12 \n" - "movdqa %%xmm4,%%xmm5 \n" - "movdqa %%xmm12,%%xmm13 \n" - "movdqu (%0),%%xmm6 \n" - "palignr $0x8,%%xmm5,%%xmm5 \n" - "palignr $0x8,%%xmm13,%%xmm13 \n" - "movdqu (%0,%3),%%xmm7 \n" - "lea (%0,%3,2),%0 \n" - "movdqa %%xmm6,%%xmm14 \n" - "punpcklbw %%xmm7,%%xmm6 \n" - "punpckhbw %%xmm7,%%xmm14 \n" - "neg %3 \n" - "movdqa %%xmm6,%%xmm7 \n" - "movdqa %%xmm14,%%xmm15 \n" - "lea 0x10(%0,%3,8),%0 \n" - "palignr $0x8,%%xmm7,%%xmm7 \n" - "palignr $0x8,%%xmm15,%%xmm15 \n" - "neg %3 \n" - // Second round of bit swap. - "punpcklwd %%xmm2,%%xmm0 \n" - "punpcklwd %%xmm3,%%xmm1 \n" - "movdqa %%xmm0,%%xmm2 \n" - "movdqa %%xmm1,%%xmm3 \n" - "palignr $0x8,%%xmm2,%%xmm2 \n" - "palignr $0x8,%%xmm3,%%xmm3 \n" - "punpcklwd %%xmm6,%%xmm4 \n" - "punpcklwd %%xmm7,%%xmm5 \n" - "movdqa %%xmm4,%%xmm6 \n" - "movdqa %%xmm5,%%xmm7 \n" - "palignr $0x8,%%xmm6,%%xmm6 \n" - "palignr $0x8,%%xmm7,%%xmm7 \n" - "punpcklwd %%xmm10,%%xmm8 \n" - "punpcklwd %%xmm11,%%xmm9 \n" - "movdqa %%xmm8,%%xmm10 \n" - "movdqa %%xmm9,%%xmm11 \n" - "palignr $0x8,%%xmm10,%%xmm10 \n" - "palignr $0x8,%%xmm11,%%xmm11 \n" - "punpcklwd %%xmm14,%%xmm12 \n" - "punpcklwd %%xmm15,%%xmm13 \n" - "movdqa %%xmm12,%%xmm14 \n" - "movdqa %%xmm13,%%xmm15 \n" - "palignr $0x8,%%xmm14,%%xmm14 \n" - "palignr $0x8,%%xmm15,%%xmm15 \n" - // Third round of bit swap. - // Write to the destination pointer. 
- "punpckldq %%xmm4,%%xmm0 \n" - "movq %%xmm0,(%1) \n" - "movdqa %%xmm0,%%xmm4 \n" - "palignr $0x8,%%xmm4,%%xmm4 \n" - "movq %%xmm4,(%1,%4) \n" - "lea (%1,%4,2),%1 \n" - "punpckldq %%xmm6,%%xmm2 \n" - "movdqa %%xmm2,%%xmm6 \n" - "movq %%xmm2,(%1) \n" - "palignr $0x8,%%xmm6,%%xmm6 \n" - "punpckldq %%xmm5,%%xmm1 \n" - "movq %%xmm6,(%1,%4) \n" - "lea (%1,%4,2),%1 \n" - "movdqa %%xmm1,%%xmm5 \n" - "movq %%xmm1,(%1) \n" - "palignr $0x8,%%xmm5,%%xmm5 \n" - "movq %%xmm5,(%1,%4) \n" - "lea (%1,%4,2),%1 \n" - "punpckldq %%xmm7,%%xmm3 \n" - "movq %%xmm3,(%1) \n" - "movdqa %%xmm3,%%xmm7 \n" - "palignr $0x8,%%xmm7,%%xmm7 \n" - "movq %%xmm7,(%1,%4) \n" - "lea (%1,%4,2),%1 \n" - "punpckldq %%xmm12,%%xmm8 \n" - "movq %%xmm8,(%1) \n" - "movdqa %%xmm8,%%xmm12 \n" - "palignr $0x8,%%xmm12,%%xmm12 \n" - "movq %%xmm12,(%1,%4) \n" - "lea (%1,%4,2),%1 \n" - "punpckldq %%xmm14,%%xmm10 \n" - "movdqa %%xmm10,%%xmm14 \n" - "movq %%xmm10,(%1) \n" - "palignr $0x8,%%xmm14,%%xmm14 \n" - "punpckldq %%xmm13,%%xmm9 \n" - "movq %%xmm14,(%1,%4) \n" - "lea (%1,%4,2),%1 \n" - "movdqa %%xmm9,%%xmm13 \n" - "movq %%xmm9,(%1) \n" - "palignr $0x8,%%xmm13,%%xmm13 \n" - "movq %%xmm13,(%1,%4) \n" - "lea (%1,%4,2),%1 \n" - "punpckldq %%xmm15,%%xmm11 \n" - "movq %%xmm11,(%1) \n" - "movdqa %%xmm11,%%xmm15 \n" - "palignr $0x8,%%xmm15,%%xmm15 \n" - "sub $0x10,%2 \n" - "movq %%xmm15,(%1,%4) \n" - "lea (%1,%4,2),%1 \n" - "jg 1b \n" - : "+r"(src), // %0 - "+r"(dst), // %1 - "+r"(width) // %2 - : "r"((intptr_t)(src_stride)), // %3 - "r"((intptr_t)(dst_stride)) // %4 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", - "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", - "xmm15"); -} -#endif // defined(HAS_TRANSPOSEWX8_FAST_SSSE3) - -// Transpose UV 8x8. 64 bit. -#if defined(HAS_TRANSPOSEUVWX8_SSE2) -void TransposeUVWx8_SSE2(const uint8_t* src, - int src_stride, - uint8_t* dst_a, - int dst_stride_a, - uint8_t* dst_b, - int dst_stride_b, - int width) { - asm volatile( - // Read in the data from the source pointer. - // First round of bit swap. - LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu (%0,%4),%%xmm1 \n" - "lea (%0,%4,2),%0 \n" - "movdqa %%xmm0,%%xmm8 \n" - "punpcklbw %%xmm1,%%xmm0 \n" - "punpckhbw %%xmm1,%%xmm8 \n" - "movdqa %%xmm8,%%xmm1 \n" - "movdqu (%0),%%xmm2 \n" - "movdqu (%0,%4),%%xmm3 \n" - "lea (%0,%4,2),%0 \n" - "movdqa %%xmm2,%%xmm8 \n" - "punpcklbw %%xmm3,%%xmm2 \n" - "punpckhbw %%xmm3,%%xmm8 \n" - "movdqa %%xmm8,%%xmm3 \n" - "movdqu (%0),%%xmm4 \n" - "movdqu (%0,%4),%%xmm5 \n" - "lea (%0,%4,2),%0 \n" - "movdqa %%xmm4,%%xmm8 \n" - "punpcklbw %%xmm5,%%xmm4 \n" - "punpckhbw %%xmm5,%%xmm8 \n" - "movdqa %%xmm8,%%xmm5 \n" - "movdqu (%0),%%xmm6 \n" - "movdqu (%0,%4),%%xmm7 \n" - "lea (%0,%4,2),%0 \n" - "movdqa %%xmm6,%%xmm8 \n" - "punpcklbw %%xmm7,%%xmm6 \n" - "neg %4 \n" - "lea 0x10(%0,%4,8),%0 \n" - "punpckhbw %%xmm7,%%xmm8 \n" - "movdqa %%xmm8,%%xmm7 \n" - "neg %4 \n" - // Second round of bit swap. - "movdqa %%xmm0,%%xmm8 \n" - "movdqa %%xmm1,%%xmm9 \n" - "punpckhwd %%xmm2,%%xmm8 \n" - "punpckhwd %%xmm3,%%xmm9 \n" - "punpcklwd %%xmm2,%%xmm0 \n" - "punpcklwd %%xmm3,%%xmm1 \n" - "movdqa %%xmm8,%%xmm2 \n" - "movdqa %%xmm9,%%xmm3 \n" - "movdqa %%xmm4,%%xmm8 \n" - "movdqa %%xmm5,%%xmm9 \n" - "punpckhwd %%xmm6,%%xmm8 \n" - "punpckhwd %%xmm7,%%xmm9 \n" - "punpcklwd %%xmm6,%%xmm4 \n" - "punpcklwd %%xmm7,%%xmm5 \n" - "movdqa %%xmm8,%%xmm6 \n" - "movdqa %%xmm9,%%xmm7 \n" - // Third round of bit swap. - // Write to the destination pointer. 
- "movdqa %%xmm0,%%xmm8 \n" - "punpckldq %%xmm4,%%xmm0 \n" - "movlpd %%xmm0,(%1) \n" // Write back U channel - "movhpd %%xmm0,(%2) \n" // Write back V channel - "punpckhdq %%xmm4,%%xmm8 \n" - "movlpd %%xmm8,(%1,%5) \n" - "lea (%1,%5,2),%1 \n" - "movhpd %%xmm8,(%2,%6) \n" - "lea (%2,%6,2),%2 \n" - "movdqa %%xmm2,%%xmm8 \n" - "punpckldq %%xmm6,%%xmm2 \n" - "movlpd %%xmm2,(%1) \n" - "movhpd %%xmm2,(%2) \n" - "punpckhdq %%xmm6,%%xmm8 \n" - "movlpd %%xmm8,(%1,%5) \n" - "lea (%1,%5,2),%1 \n" - "movhpd %%xmm8,(%2,%6) \n" - "lea (%2,%6,2),%2 \n" - "movdqa %%xmm1,%%xmm8 \n" - "punpckldq %%xmm5,%%xmm1 \n" - "movlpd %%xmm1,(%1) \n" - "movhpd %%xmm1,(%2) \n" - "punpckhdq %%xmm5,%%xmm8 \n" - "movlpd %%xmm8,(%1,%5) \n" - "lea (%1,%5,2),%1 \n" - "movhpd %%xmm8,(%2,%6) \n" - "lea (%2,%6,2),%2 \n" - "movdqa %%xmm3,%%xmm8 \n" - "punpckldq %%xmm7,%%xmm3 \n" - "movlpd %%xmm3,(%1) \n" - "movhpd %%xmm3,(%2) \n" - "punpckhdq %%xmm7,%%xmm8 \n" - "sub $0x8,%3 \n" - "movlpd %%xmm8,(%1,%5) \n" - "lea (%1,%5,2),%1 \n" - "movhpd %%xmm8,(%2,%6) \n" - "lea (%2,%6,2),%2 \n" - "jg 1b \n" - : "+r"(src), // %0 - "+r"(dst_a), // %1 - "+r"(dst_b), // %2 - "+r"(width) // %3 - : "r"((intptr_t)(src_stride)), // %4 - "r"((intptr_t)(dst_stride_a)), // %5 - "r"((intptr_t)(dst_stride_b)) // %6 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", - "xmm7", "xmm8", "xmm9"); -} -#endif // defined(HAS_TRANSPOSEUVWX8_SSE2) -#endif // defined(__x86_64__) || defined(__i386__) - -#ifdef __cplusplus -} // extern "C" -} // namespace libyuv -#endif diff --git a/thirdparty/libyuv/source/rotate_mmi.cc b/thirdparty/libyuv/source/rotate_mmi.cc deleted file mode 100644 index f8de608..0000000 --- a/thirdparty/libyuv/source/rotate_mmi.cc +++ /dev/null @@ -1,291 +0,0 @@ -/* - * Copyright 2011 The LibYuv Project Authors. All rights reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#include "libyuv/rotate_row.h" -#include "libyuv/row.h" - -#ifdef __cplusplus -namespace libyuv { -extern "C" { -#endif - -// This module is for Mips MMI. 
-#if !defined(LIBYUV_DISABLE_MMI) && defined(_MIPS_ARCH_LOONGSON3A) - -void TransposeWx8_MMI(const uint8_t* src, - int src_stride, - uint8_t* dst, - int dst_stride, - int width) { - uint64_t tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6; - uint64_t tmp7, tmp8, tmp9, tmp10, tmp11, tmp12, tmp13; - uint8_t* src_tmp = nullptr; - - __asm__ volatile( - "1: \n\t" - "ldc1 %[tmp12], 0x00(%[src]) \n\t" - "dadd %[src_tmp], %[src], %[src_stride] \n\t" - "ldc1 %[tmp13], 0x00(%[src_tmp]) \n\t" - - /* tmp0 = (00 10 01 11 02 12 03 13) */ - "punpcklbh %[tmp0], %[tmp12], %[tmp13] \n\t" - /* tmp1 = (04 14 05 15 06 16 07 17) */ - "punpckhbh %[tmp1], %[tmp12], %[tmp13] \n\t" - - "dadd %[src_tmp], %[src_tmp], %[src_stride] \n\t" - "ldc1 %[tmp12], 0x00(%[src_tmp]) \n\t" - "dadd %[src_tmp], %[src_tmp], %[src_stride] \n\t" - "ldc1 %[tmp13], 0x00(%[src_tmp]) \n\t" - - /* tmp2 = (20 30 21 31 22 32 23 33) */ - "punpcklbh %[tmp2], %[tmp12], %[tmp13] \n\t" - /* tmp3 = (24 34 25 35 26 36 27 37) */ - "punpckhbh %[tmp3], %[tmp12], %[tmp13] \n\t" - - /* tmp4 = (00 10 20 30 01 11 21 31) */ - "punpcklhw %[tmp4], %[tmp0], %[tmp2] \n\t" - /* tmp5 = (02 12 22 32 03 13 23 33) */ - "punpckhhw %[tmp5], %[tmp0], %[tmp2] \n\t" - /* tmp6 = (04 14 24 34 05 15 25 35) */ - "punpcklhw %[tmp6], %[tmp1], %[tmp3] \n\t" - /* tmp7 = (06 16 26 36 07 17 27 37) */ - "punpckhhw %[tmp7], %[tmp1], %[tmp3] \n\t" - - "dadd %[src_tmp], %[src_tmp], %[src_stride] \n\t" - "ldc1 %[tmp12], 0x00(%[src_tmp]) \n\t" - "dadd %[src_tmp], %[src_tmp], %[src_stride] \n\t" - "ldc1 %[tmp13], 0x00(%[src_tmp]) \n\t" - - /* tmp0 = (40 50 41 51 42 52 43 53) */ - "punpcklbh %[tmp0], %[tmp12], %[tmp13] \n\t" - /* tmp1 = (44 54 45 55 46 56 47 57) */ - "punpckhbh %[tmp1], %[tmp12], %[tmp13] \n\t" - - "dadd %[src_tmp], %[src_tmp], %[src_stride] \n\t" - "ldc1 %[tmp12], 0x00(%[src_tmp]) \n\t" - "dadd %[src_tmp], %[src_tmp], %[src_stride] \n\t" - "ldc1 %[tmp13], 0x00(%[src_tmp]) \n\t" - - /* tmp2 = (60 70 61 71 62 72 63 73) */ - "punpcklbh %[tmp2], %[tmp12], %[tmp13] \n\t" - /* tmp3 = (64 74 65 75 66 76 67 77) */ - "punpckhbh %[tmp3], %[tmp12], %[tmp13] \n\t" - - /* tmp8 = (40 50 60 70 41 51 61 71) */ - "punpcklhw %[tmp8], %[tmp0], %[tmp2] \n\t" - /* tmp9 = (42 52 62 72 43 53 63 73) */ - "punpckhhw %[tmp9], %[tmp0], %[tmp2] \n\t" - /* tmp10 = (44 54 64 74 45 55 65 75) */ - "punpcklhw %[tmp10], %[tmp1], %[tmp3] \n\t" - /* tmp11 = (46 56 66 76 47 57 67 77) */ - "punpckhhw %[tmp11], %[tmp1], %[tmp3] \n\t" - - /* tmp0 = (00 10 20 30 40 50 60 70) */ - "punpcklwd %[tmp0], %[tmp4], %[tmp8] \n\t" - /* tmp1 = (01 11 21 31 41 51 61 71) */ - "punpckhwd %[tmp1], %[tmp4], %[tmp8] \n\t" - "gssdlc1 %[tmp0], 0x07(%[dst]) \n\t" - "gssdrc1 %[tmp0], 0x00(%[dst]) \n\t" - "dadd %[dst], %[dst], %[dst_stride] \n\t" - "gssdlc1 %[tmp1], 0x07(%[dst]) \n\t" - "gssdrc1 %[tmp1], 0x00(%[dst]) \n\t" - - /* tmp0 = (02 12 22 32 42 52 62 72) */ - "punpcklwd %[tmp0], %[tmp5], %[tmp9] \n\t" - /* tmp1 = (03 13 23 33 43 53 63 73) */ - "punpckhwd %[tmp1], %[tmp5], %[tmp9] \n\t" - "dadd %[dst], %[dst], %[dst_stride] \n\t" - "gssdlc1 %[tmp0], 0x07(%[dst]) \n\t" - "gssdrc1 %[tmp0], 0x00(%[dst]) \n\t" - "dadd %[dst], %[dst], %[dst_stride] \n\t" - "gssdlc1 %[tmp1], 0x07(%[dst]) \n\t" - "gssdrc1 %[tmp1], 0x00(%[dst]) \n\t" - - /* tmp0 = (04 14 24 34 44 54 64 74) */ - "punpcklwd %[tmp0], %[tmp6], %[tmp10] \n\t" - /* tmp1 = (05 15 25 35 45 55 65 75) */ - "punpckhwd %[tmp1], %[tmp6], %[tmp10] \n\t" - "dadd %[dst], %[dst], %[dst_stride] \n\t" - "gssdlc1 %[tmp0], 0x07(%[dst]) \n\t" - "gssdrc1 %[tmp0], 0x00(%[dst]) \n\t" - "dadd 
%[dst], %[dst], %[dst_stride] \n\t" - "gssdlc1 %[tmp1], 0x07(%[dst]) \n\t" - "gssdrc1 %[tmp1], 0x00(%[dst]) \n\t" - - /* tmp0 = (06 16 26 36 46 56 66 76) */ - "punpcklwd %[tmp0], %[tmp7], %[tmp11] \n\t" - /* tmp1 = (07 17 27 37 47 57 67 77) */ - "punpckhwd %[tmp1], %[tmp7], %[tmp11] \n\t" - "dadd %[dst], %[dst], %[dst_stride] \n\t" - "gssdlc1 %[tmp0], 0x07(%[dst]) \n\t" - "gssdrc1 %[tmp0], 0x00(%[dst]) \n\t" - "dadd %[dst], %[dst], %[dst_stride] \n\t" - "gssdlc1 %[tmp1], 0x07(%[dst]) \n\t" - "gssdrc1 %[tmp1], 0x00(%[dst]) \n\t" - - "dadd %[dst], %[dst], %[dst_stride] \n\t" - "daddi %[src], %[src], 0x08 \n\t" - "daddi %[width], %[width], -0x08 \n\t" - "bnez %[width], 1b \n\t" - - : [tmp0] "=&f"(tmp0), [tmp1] "=&f"(tmp1), [tmp2] "=&f"(tmp2), - [tmp3] "=&f"(tmp3), [tmp4] "=&f"(tmp4), [tmp5] "=&f"(tmp5), - [tmp6] "=&f"(tmp6), [tmp7] "=&f"(tmp7), [tmp8] "=&f"(tmp8), - [tmp9] "=&f"(tmp9), [tmp10] "=&f"(tmp10), [tmp11] "=&f"(tmp11), - [tmp12] "=&f"(tmp12), [tmp13] "=&f"(tmp13), [dst] "+&r"(dst), - [src_tmp] "+&r"(src_tmp) - : [src] "r"(src), [width] "r"(width), [src_stride] "r"(src_stride), - [dst_stride] "r"(dst_stride) - : "memory"); -} - -void TransposeUVWx8_MMI(const uint8_t* src, - int src_stride, - uint8_t* dst_a, - int dst_stride_a, - uint8_t* dst_b, - int dst_stride_b, - int width) { - uint64_t tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6; - uint64_t tmp7, tmp8, tmp9, tmp10, tmp11, tmp12, tmp13; - uint8_t* src_tmp = nullptr; - - __asm__ volatile( - "1: \n\t" - /* tmp12 = (u00 v00 u01 v01 u02 v02 u03 v03) */ - "ldc1 %[tmp12], 0x00(%[src]) \n\t" - "dadd %[src_tmp], %[src], %[src_stride] \n\t" - /* tmp13 = (u10 v10 u11 v11 u12 v12 u13 v13) */ - "ldc1 %[tmp13], 0x00(%[src_tmp]) \n\t" - - /* tmp0 = (u00 u10 v00 v10 u01 u11 v01 v11) */ - "punpcklbh %[tmp0], %[tmp12], %[tmp13] \n\t" - /* tmp1 = (u02 u12 v02 v12 u03 u13 v03 v13) */ - "punpckhbh %[tmp1], %[tmp12], %[tmp13] \n\t" - - "dadd %[src_tmp], %[src_tmp], %[src_stride] \n\t" - /* tmp12 = (u20 v20 u21 v21 u22 v22 u23 v23) */ - "ldc1 %[tmp12], 0x00(%[src_tmp]) \n\t" - "dadd %[src_tmp], %[src_tmp], %[src_stride] \n\t" - /* tmp13 = (u30 v30 u31 v31 u32 v32 u33 v33) */ - "ldc1 %[tmp13], 0x00(%[src_tmp]) \n\t" - - /* tmp2 = (u20 u30 v20 v30 u21 u31 v21 v31) */ - "punpcklbh %[tmp2], %[tmp12], %[tmp13] \n\t" - /* tmp3 = (u22 u32 v22 v32 u23 u33 v23 v33) */ - "punpckhbh %[tmp3], %[tmp12], %[tmp13] \n\t" - - /* tmp4 = (u00 u10 u20 u30 v00 v10 v20 v30) */ - "punpcklhw %[tmp4], %[tmp0], %[tmp2] \n\t" - /* tmp5 = (u01 u11 u21 u31 v01 v11 v21 v31) */ - "punpckhhw %[tmp5], %[tmp0], %[tmp2] \n\t" - /* tmp6 = (u02 u12 u22 u32 v02 v12 v22 v32) */ - "punpcklhw %[tmp6], %[tmp1], %[tmp3] \n\t" - /* tmp7 = (u03 u13 u23 u33 v03 v13 v23 v33) */ - "punpckhhw %[tmp7], %[tmp1], %[tmp3] \n\t" - - "dadd %[src_tmp], %[src_tmp], %[src_stride] \n\t" - /* tmp12 = (u40 v40 u41 v41 u42 v42 u43 v43) */ - "ldc1 %[tmp12], 0x00(%[src_tmp]) \n\t" - /* tmp13 = (u50 v50 u51 v51 u52 v52 u53 v53) */ - "dadd %[src_tmp], %[src_tmp], %[src_stride] \n\t" - "ldc1 %[tmp13], 0x00(%[src_tmp]) \n\t" - - /* tmp0 = (u40 u50 v40 v50 u41 u51 v41 v51) */ - "punpcklbh %[tmp0], %[tmp12], %[tmp13] \n\t" - /* tmp1 = (u42 u52 v42 v52 u43 u53 v43 v53) */ - "punpckhbh %[tmp1], %[tmp12], %[tmp13] \n\t" - - "dadd %[src_tmp], %[src_tmp], %[src_stride] \n\t" - /* tmp12 = (u60 v60 u61 v61 u62 v62 u63 v63) */ - "ldc1 %[tmp12], 0x00(%[src_tmp]) \n\t" - /* tmp13 = (u70 v70 u71 v71 u72 v72 u73 v73) */ - "dadd %[src_tmp], %[src_tmp], %[src_stride] \n\t" - "ldc1 %[tmp13], 0x00(%[src_tmp]) \n\t" - - /* tmp2 = (u60 u70 
v60 v70 u61 u71 v61 v71) */ - "punpcklbh %[tmp2], %[tmp12], %[tmp13] \n\t" - /* tmp3 = (u62 u72 v62 v72 u63 u73 v63 v73) */ - "punpckhbh %[tmp3], %[tmp12], %[tmp13] \n\t" - - /* tmp8 = (u40 u50 u60 u70 v40 v50 v60 v70) */ - "punpcklhw %[tmp8], %[tmp0], %[tmp2] \n\t" - /* tmp9 = (u41 u51 u61 u71 v41 v51 v61 v71) */ - "punpckhhw %[tmp9], %[tmp0], %[tmp2] \n\t" - /* tmp10 = (u42 u52 u62 u72 v42 v52 v62 v72) */ - "punpcklhw %[tmp10], %[tmp1], %[tmp3] \n\t" - /* tmp11 = (u43 u53 u63 u73 v43 v53 v63 v73) */ - "punpckhhw %[tmp11], %[tmp1], %[tmp3] \n\t" - - /* tmp0 = (u00 u10 u20 u30 u40 u50 u60 u70) */ - "punpcklwd %[tmp0], %[tmp4], %[tmp8] \n\t" - /* tmp1 = (v00 v10 v20 v30 v40 v50 v60 v70) */ - "punpckhwd %[tmp1], %[tmp4], %[tmp8] \n\t" - "gssdlc1 %[tmp0], 0x07(%[dst_a]) \n\t" - "gssdrc1 %[tmp0], 0x00(%[dst_a]) \n\t" - "gssdlc1 %[tmp1], 0x07(%[dst_b]) \n\t" - "gssdrc1 %[tmp1], 0x00(%[dst_b]) \n\t" - - /* tmp0 = (u01 u11 u21 u31 u41 u51 u61 u71) */ - "punpcklwd %[tmp0], %[tmp5], %[tmp9] \n\t" - /* tmp1 = (v01 v11 v21 v31 v41 v51 v61 v71) */ - "punpckhwd %[tmp1], %[tmp5], %[tmp9] \n\t" - "dadd %[dst_a], %[dst_a], %[dst_stride_a] \n\t" - "gssdlc1 %[tmp0], 0x07(%[dst_a]) \n\t" - "gssdrc1 %[tmp0], 0x00(%[dst_a]) \n\t" - "dadd %[dst_b], %[dst_b], %[dst_stride_b] \n\t" - "gssdlc1 %[tmp1], 0x07(%[dst_b]) \n\t" - "gssdrc1 %[tmp1], 0x00(%[dst_b]) \n\t" - - /* tmp0 = (u02 u12 u22 u32 u42 u52 u62 u72) */ - "punpcklwd %[tmp0], %[tmp6], %[tmp10] \n\t" - /* tmp1 = (v02 v12 v22 v32 v42 v52 v62 v72) */ - "punpckhwd %[tmp1], %[tmp6], %[tmp10] \n\t" - "dadd %[dst_a], %[dst_a], %[dst_stride_a] \n\t" - "gssdlc1 %[tmp0], 0x07(%[dst_a]) \n\t" - "gssdrc1 %[tmp0], 0x00(%[dst_a]) \n\t" - "dadd %[dst_b], %[dst_b], %[dst_stride_b] \n\t" - "gssdlc1 %[tmp1], 0x07(%[dst_b]) \n\t" - "gssdrc1 %[tmp1], 0x00(%[dst_b]) \n\t" - - /* tmp0 = (u03 u13 u23 u33 u43 u53 u63 u73) */ - "punpcklwd %[tmp0], %[tmp7], %[tmp11] \n\t" - /* tmp1 = (v03 v13 v23 v33 v43 v53 v63 v73) */ - "punpckhwd %[tmp1], %[tmp7], %[tmp11] \n\t" - "dadd %[dst_a], %[dst_a], %[dst_stride_a] \n\t" - "gssdlc1 %[tmp0], 0x07(%[dst_a]) \n\t" - "gssdrc1 %[tmp0], 0x00(%[dst_a]) \n\t" - "dadd %[dst_b], %[dst_b], %[dst_stride_b] \n\t" - "gssdlc1 %[tmp1], 0x07(%[dst_b]) \n\t" - "gssdrc1 %[tmp1], 0x00(%[dst_b]) \n\t" - - "dadd %[dst_a], %[dst_a], %[dst_stride_a] \n\t" - "dadd %[dst_b], %[dst_b], %[dst_stride_b] \n\t" - "daddiu %[src], %[src], 0x08 \n\t" - "daddi %[width], %[width], -0x04 \n\t" - "bnez %[width], 1b \n\t" - - : [tmp0] "=&f"(tmp0), [tmp1] "=&f"(tmp1), [tmp2] "=&f"(tmp2), - [tmp3] "=&f"(tmp3), [tmp4] "=&f"(tmp4), [tmp5] "=&f"(tmp5), - [tmp6] "=&f"(tmp6), [tmp7] "=&f"(tmp7), [tmp8] "=&f"(tmp8), - [tmp9] "=&f"(tmp9), [tmp10] "=&f"(tmp10), [tmp11] "=&f"(tmp11), - [tmp12] "=&f"(tmp12), [tmp13] "=&f"(tmp13), [dst_a] "+&r"(dst_a), - [dst_b] "+&r"(dst_b), [src_tmp] "+&r"(src_tmp) - : [src] "r"(src), [width] "r"(width), [dst_stride_a] "r"(dst_stride_a), - [dst_stride_b] "r"(dst_stride_b), [src_stride] "r"(src_stride) - : "memory"); -} - -#endif // !defined(LIBYUV_DISABLE_MMI) && defined(_MIPS_ARCH_LOONGSON3A) - -#ifdef __cplusplus -} // extern "C" -} // namespace libyuv -#endif diff --git a/thirdparty/libyuv/source/rotate_msa.cc b/thirdparty/libyuv/source/rotate_msa.cc deleted file mode 100644 index 99bdca6..0000000 --- a/thirdparty/libyuv/source/rotate_msa.cc +++ /dev/null @@ -1,250 +0,0 @@ -/* - * Copyright 2016 The LibYuv Project Authors. All rights reserved. 
- * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#include "libyuv/rotate_row.h" - -// This module is for GCC MSA -#if !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa) -#include "libyuv/macros_msa.h" - -#ifdef __cplusplus -namespace libyuv { -extern "C" { -#endif - -#define ILVRL_B(in0, in1, in2, in3, out0, out1, out2, out3) \ - { \ - out0 = (v16u8)__msa_ilvr_b((v16i8)in1, (v16i8)in0); \ - out1 = (v16u8)__msa_ilvl_b((v16i8)in1, (v16i8)in0); \ - out2 = (v16u8)__msa_ilvr_b((v16i8)in3, (v16i8)in2); \ - out3 = (v16u8)__msa_ilvl_b((v16i8)in3, (v16i8)in2); \ - } - -#define ILVRL_H(in0, in1, in2, in3, out0, out1, out2, out3) \ - { \ - out0 = (v16u8)__msa_ilvr_h((v8i16)in1, (v8i16)in0); \ - out1 = (v16u8)__msa_ilvl_h((v8i16)in1, (v8i16)in0); \ - out2 = (v16u8)__msa_ilvr_h((v8i16)in3, (v8i16)in2); \ - out3 = (v16u8)__msa_ilvl_h((v8i16)in3, (v8i16)in2); \ - } - -#define ILVRL_W(in0, in1, in2, in3, out0, out1, out2, out3) \ - { \ - out0 = (v16u8)__msa_ilvr_w((v4i32)in1, (v4i32)in0); \ - out1 = (v16u8)__msa_ilvl_w((v4i32)in1, (v4i32)in0); \ - out2 = (v16u8)__msa_ilvr_w((v4i32)in3, (v4i32)in2); \ - out3 = (v16u8)__msa_ilvl_w((v4i32)in3, (v4i32)in2); \ - } - -#define ILVRL_D(in0, in1, in2, in3, out0, out1, out2, out3) \ - { \ - out0 = (v16u8)__msa_ilvr_d((v2i64)in1, (v2i64)in0); \ - out1 = (v16u8)__msa_ilvl_d((v2i64)in1, (v2i64)in0); \ - out2 = (v16u8)__msa_ilvr_d((v2i64)in3, (v2i64)in2); \ - out3 = (v16u8)__msa_ilvl_d((v2i64)in3, (v2i64)in2); \ - } - -void TransposeWx16_C(const uint8_t* src, - int src_stride, - uint8_t* dst, - int dst_stride, - int width) { - TransposeWx8_C(src, src_stride, dst, dst_stride, width); - TransposeWx8_C((src + 8 * src_stride), src_stride, (dst + 8), dst_stride, - width); -} - -void TransposeUVWx16_C(const uint8_t* src, - int src_stride, - uint8_t* dst_a, - int dst_stride_a, - uint8_t* dst_b, - int dst_stride_b, - int width) { - TransposeUVWx8_C(src, src_stride, dst_a, dst_stride_a, dst_b, dst_stride_b, - width); - TransposeUVWx8_C((src + 8 * src_stride), src_stride, (dst_a + 8), - dst_stride_a, (dst_b + 8), dst_stride_b, width); -} - -void TransposeWx16_MSA(const uint8_t* src, - int src_stride, - uint8_t* dst, - int dst_stride, - int width) { - int x; - const uint8_t* s; - v16u8 src0, src1, src2, src3, dst0, dst1, dst2, dst3, vec0, vec1, vec2, vec3; - v16u8 reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7; - v16u8 res0, res1, res2, res3, res4, res5, res6, res7, res8, res9; - - for (x = 0; x < width; x += 16) { - s = src; - src0 = (v16u8)__msa_ld_b((v16i8*)s, 0); - s += src_stride; - src1 = (v16u8)__msa_ld_b((v16i8*)s, 0); - s += src_stride; - src2 = (v16u8)__msa_ld_b((v16i8*)s, 0); - s += src_stride; - src3 = (v16u8)__msa_ld_b((v16i8*)s, 0); - s += src_stride; - ILVRL_B(src0, src1, src2, src3, vec0, vec1, vec2, vec3); - ILVRL_H(vec0, vec2, vec1, vec3, reg0, reg1, reg2, reg3); - src0 = (v16u8)__msa_ld_b((v16i8*)s, 0); - s += src_stride; - src1 = (v16u8)__msa_ld_b((v16i8*)s, 0); - s += src_stride; - src2 = (v16u8)__msa_ld_b((v16i8*)s, 0); - s += src_stride; - src3 = (v16u8)__msa_ld_b((v16i8*)s, 0); - s += src_stride; - ILVRL_B(src0, src1, src2, src3, vec0, vec1, vec2, vec3); - ILVRL_H(vec0, vec2, vec1, vec3, reg4, reg5, reg6, reg7); - ILVRL_W(reg0, reg4, reg1, reg5, res0, res1, res2, 
res3); - ILVRL_W(reg2, reg6, reg3, reg7, res4, res5, res6, res7); - src0 = (v16u8)__msa_ld_b((v16i8*)s, 0); - s += src_stride; - src1 = (v16u8)__msa_ld_b((v16i8*)s, 0); - s += src_stride; - src2 = (v16u8)__msa_ld_b((v16i8*)s, 0); - s += src_stride; - src3 = (v16u8)__msa_ld_b((v16i8*)s, 0); - s += src_stride; - ILVRL_B(src0, src1, src2, src3, vec0, vec1, vec2, vec3); - ILVRL_H(vec0, vec2, vec1, vec3, reg0, reg1, reg2, reg3); - src0 = (v16u8)__msa_ld_b((v16i8*)s, 0); - s += src_stride; - src1 = (v16u8)__msa_ld_b((v16i8*)s, 0); - s += src_stride; - src2 = (v16u8)__msa_ld_b((v16i8*)s, 0); - s += src_stride; - src3 = (v16u8)__msa_ld_b((v16i8*)s, 0); - s += src_stride; - ILVRL_B(src0, src1, src2, src3, vec0, vec1, vec2, vec3); - ILVRL_H(vec0, vec2, vec1, vec3, reg4, reg5, reg6, reg7); - res8 = (v16u8)__msa_ilvr_w((v4i32)reg4, (v4i32)reg0); - res9 = (v16u8)__msa_ilvl_w((v4i32)reg4, (v4i32)reg0); - ILVRL_D(res0, res8, res1, res9, dst0, dst1, dst2, dst3); - ST_UB4(dst0, dst1, dst2, dst3, dst, dst_stride); - dst += dst_stride * 4; - res8 = (v16u8)__msa_ilvr_w((v4i32)reg5, (v4i32)reg1); - res9 = (v16u8)__msa_ilvl_w((v4i32)reg5, (v4i32)reg1); - ILVRL_D(res2, res8, res3, res9, dst0, dst1, dst2, dst3); - ST_UB4(dst0, dst1, dst2, dst3, dst, dst_stride); - dst += dst_stride * 4; - res8 = (v16u8)__msa_ilvr_w((v4i32)reg6, (v4i32)reg2); - res9 = (v16u8)__msa_ilvl_w((v4i32)reg6, (v4i32)reg2); - ILVRL_D(res4, res8, res5, res9, dst0, dst1, dst2, dst3); - ST_UB4(dst0, dst1, dst2, dst3, dst, dst_stride); - dst += dst_stride * 4; - res8 = (v16u8)__msa_ilvr_w((v4i32)reg7, (v4i32)reg3); - res9 = (v16u8)__msa_ilvl_w((v4i32)reg7, (v4i32)reg3); - ILVRL_D(res6, res8, res7, res9, dst0, dst1, dst2, dst3); - ST_UB4(dst0, dst1, dst2, dst3, dst, dst_stride); - src += 16; - dst += dst_stride * 4; - } -} - -void TransposeUVWx16_MSA(const uint8_t* src, - int src_stride, - uint8_t* dst_a, - int dst_stride_a, - uint8_t* dst_b, - int dst_stride_b, - int width) { - int x; - const uint8_t* s; - v16u8 src0, src1, src2, src3, dst0, dst1, dst2, dst3, vec0, vec1, vec2, vec3; - v16u8 reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7; - v16u8 res0, res1, res2, res3, res4, res5, res6, res7, res8, res9; - - for (x = 0; x < width; x += 8) { - s = src; - src0 = (v16u8)__msa_ld_b((v16i8*)s, 0); - s += src_stride; - src1 = (v16u8)__msa_ld_b((v16i8*)s, 0); - s += src_stride; - src2 = (v16u8)__msa_ld_b((v16i8*)s, 0); - s += src_stride; - src3 = (v16u8)__msa_ld_b((v16i8*)s, 0); - s += src_stride; - ILVRL_B(src0, src1, src2, src3, vec0, vec1, vec2, vec3); - ILVRL_H(vec0, vec2, vec1, vec3, reg0, reg1, reg2, reg3); - src0 = (v16u8)__msa_ld_b((v16i8*)s, 0); - s += src_stride; - src1 = (v16u8)__msa_ld_b((v16i8*)s, 0); - s += src_stride; - src2 = (v16u8)__msa_ld_b((v16i8*)s, 0); - s += src_stride; - src3 = (v16u8)__msa_ld_b((v16i8*)s, 0); - s += src_stride; - ILVRL_B(src0, src1, src2, src3, vec0, vec1, vec2, vec3); - ILVRL_H(vec0, vec2, vec1, vec3, reg4, reg5, reg6, reg7); - ILVRL_W(reg0, reg4, reg1, reg5, res0, res1, res2, res3); - ILVRL_W(reg2, reg6, reg3, reg7, res4, res5, res6, res7); - src0 = (v16u8)__msa_ld_b((v16i8*)s, 0); - s += src_stride; - src1 = (v16u8)__msa_ld_b((v16i8*)s, 0); - s += src_stride; - src2 = (v16u8)__msa_ld_b((v16i8*)s, 0); - s += src_stride; - src3 = (v16u8)__msa_ld_b((v16i8*)s, 0); - s += src_stride; - ILVRL_B(src0, src1, src2, src3, vec0, vec1, vec2, vec3); - ILVRL_H(vec0, vec2, vec1, vec3, reg0, reg1, reg2, reg3); - src0 = (v16u8)__msa_ld_b((v16i8*)s, 0); - s += src_stride; - src1 = (v16u8)__msa_ld_b((v16i8*)s, 0); - s += 
src_stride; - src2 = (v16u8)__msa_ld_b((v16i8*)s, 0); - s += src_stride; - src3 = (v16u8)__msa_ld_b((v16i8*)s, 0); - s += src_stride; - ILVRL_B(src0, src1, src2, src3, vec0, vec1, vec2, vec3); - ILVRL_H(vec0, vec2, vec1, vec3, reg4, reg5, reg6, reg7); - res8 = (v16u8)__msa_ilvr_w((v4i32)reg4, (v4i32)reg0); - res9 = (v16u8)__msa_ilvl_w((v4i32)reg4, (v4i32)reg0); - ILVRL_D(res0, res8, res1, res9, dst0, dst1, dst2, dst3); - ST_UB2(dst0, dst2, dst_a, dst_stride_a); - ST_UB2(dst1, dst3, dst_b, dst_stride_b); - dst_a += dst_stride_a * 2; - dst_b += dst_stride_b * 2; - res8 = (v16u8)__msa_ilvr_w((v4i32)reg5, (v4i32)reg1); - res9 = (v16u8)__msa_ilvl_w((v4i32)reg5, (v4i32)reg1); - ILVRL_D(res2, res8, res3, res9, dst0, dst1, dst2, dst3); - ST_UB2(dst0, dst2, dst_a, dst_stride_a); - ST_UB2(dst1, dst3, dst_b, dst_stride_b); - dst_a += dst_stride_a * 2; - dst_b += dst_stride_b * 2; - res8 = (v16u8)__msa_ilvr_w((v4i32)reg6, (v4i32)reg2); - res9 = (v16u8)__msa_ilvl_w((v4i32)reg6, (v4i32)reg2); - ILVRL_D(res4, res8, res5, res9, dst0, dst1, dst2, dst3); - ST_UB2(dst0, dst2, dst_a, dst_stride_a); - ST_UB2(dst1, dst3, dst_b, dst_stride_b); - dst_a += dst_stride_a * 2; - dst_b += dst_stride_b * 2; - res8 = (v16u8)__msa_ilvr_w((v4i32)reg7, (v4i32)reg3); - res9 = (v16u8)__msa_ilvl_w((v4i32)reg7, (v4i32)reg3); - ILVRL_D(res6, res8, res7, res9, dst0, dst1, dst2, dst3); - ST_UB2(dst0, dst2, dst_a, dst_stride_a); - ST_UB2(dst1, dst3, dst_b, dst_stride_b); - src += 16; - dst_a += dst_stride_a * 2; - dst_b += dst_stride_b * 2; - } -} - -#ifdef __cplusplus -} // extern "C" -} // namespace libyuv -#endif - -#endif // !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa) diff --git a/thirdparty/libyuv/source/rotate_neon.cc b/thirdparty/libyuv/source/rotate_neon.cc deleted file mode 100644 index 844df2b..0000000 --- a/thirdparty/libyuv/source/rotate_neon.cc +++ /dev/null @@ -1,418 +0,0 @@ -/* - * Copyright 2011 The LibYuv Project Authors. All rights reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#include "libyuv/rotate_row.h" -#include "libyuv/row.h" - -#include "libyuv/basic_types.h" - -#ifdef __cplusplus -namespace libyuv { -extern "C" { -#endif - -#if !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__) && \ - !defined(__aarch64__) - -static const uvec8 kVTbl4x4Transpose = {0, 4, 8, 12, 1, 5, 9, 13, - 2, 6, 10, 14, 3, 7, 11, 15}; - -void TransposeWx8_NEON(const uint8_t* src, - int src_stride, - uint8_t* dst, - int dst_stride, - int width) { - const uint8_t* src_temp; - asm volatile( - // loops are on blocks of 8. loop will stop when - // counter gets to or below 0. starting the counter - // at w-8 allow for this - "sub %5, #8 \n" - - // handle 8x8 blocks. 
this should be the majority of the plane - "1: \n" - "mov %0, %1 \n" - - "vld1.8 {d0}, [%0], %2 \n" - "vld1.8 {d1}, [%0], %2 \n" - "vld1.8 {d2}, [%0], %2 \n" - "vld1.8 {d3}, [%0], %2 \n" - "vld1.8 {d4}, [%0], %2 \n" - "vld1.8 {d5}, [%0], %2 \n" - "vld1.8 {d6}, [%0], %2 \n" - "vld1.8 {d7}, [%0] \n" - - "vtrn.8 d1, d0 \n" - "vtrn.8 d3, d2 \n" - "vtrn.8 d5, d4 \n" - "vtrn.8 d7, d6 \n" - - "vtrn.16 d1, d3 \n" - "vtrn.16 d0, d2 \n" - "vtrn.16 d5, d7 \n" - "vtrn.16 d4, d6 \n" - - "vtrn.32 d1, d5 \n" - "vtrn.32 d0, d4 \n" - "vtrn.32 d3, d7 \n" - "vtrn.32 d2, d6 \n" - - "vrev16.8 q0, q0 \n" - "vrev16.8 q1, q1 \n" - "vrev16.8 q2, q2 \n" - "vrev16.8 q3, q3 \n" - - "mov %0, %3 \n" - - "vst1.8 {d1}, [%0], %4 \n" - "vst1.8 {d0}, [%0], %4 \n" - "vst1.8 {d3}, [%0], %4 \n" - "vst1.8 {d2}, [%0], %4 \n" - "vst1.8 {d5}, [%0], %4 \n" - "vst1.8 {d4}, [%0], %4 \n" - "vst1.8 {d7}, [%0], %4 \n" - "vst1.8 {d6}, [%0] \n" - - "add %1, #8 \n" // src += 8 - "add %3, %3, %4, lsl #3 \n" // dst += 8 * dst_stride - "subs %5, #8 \n" // w -= 8 - "bge 1b \n" - - // add 8 back to counter. if the result is 0 there are - // no residuals. - "adds %5, #8 \n" - "beq 4f \n" - - // some residual, so between 1 and 7 lines left to transpose - "cmp %5, #2 \n" - "blt 3f \n" - - "cmp %5, #4 \n" - "blt 2f \n" - - // 4x8 block - "mov %0, %1 \n" - "vld1.32 {d0[0]}, [%0], %2 \n" - "vld1.32 {d0[1]}, [%0], %2 \n" - "vld1.32 {d1[0]}, [%0], %2 \n" - "vld1.32 {d1[1]}, [%0], %2 \n" - "vld1.32 {d2[0]}, [%0], %2 \n" - "vld1.32 {d2[1]}, [%0], %2 \n" - "vld1.32 {d3[0]}, [%0], %2 \n" - "vld1.32 {d3[1]}, [%0] \n" - - "mov %0, %3 \n" - - "vld1.8 {q3}, [%6] \n" - - "vtbl.8 d4, {d0, d1}, d6 \n" - "vtbl.8 d5, {d0, d1}, d7 \n" - "vtbl.8 d0, {d2, d3}, d6 \n" - "vtbl.8 d1, {d2, d3}, d7 \n" - - // TODO(frkoenig): Rework shuffle above to - // write out with 4 instead of 8 writes. 
- "vst1.32 {d4[0]}, [%0], %4 \n" - "vst1.32 {d4[1]}, [%0], %4 \n" - "vst1.32 {d5[0]}, [%0], %4 \n" - "vst1.32 {d5[1]}, [%0] \n" - - "add %0, %3, #4 \n" - "vst1.32 {d0[0]}, [%0], %4 \n" - "vst1.32 {d0[1]}, [%0], %4 \n" - "vst1.32 {d1[0]}, [%0], %4 \n" - "vst1.32 {d1[1]}, [%0] \n" - - "add %1, #4 \n" // src += 4 - "add %3, %3, %4, lsl #2 \n" // dst += 4 * dst_stride - "subs %5, #4 \n" // w -= 4 - "beq 4f \n" - - // some residual, check to see if it includes a 2x8 block, - // or less - "cmp %5, #2 \n" - "blt 3f \n" - - // 2x8 block - "2: \n" - "mov %0, %1 \n" - "vld1.16 {d0[0]}, [%0], %2 \n" - "vld1.16 {d1[0]}, [%0], %2 \n" - "vld1.16 {d0[1]}, [%0], %2 \n" - "vld1.16 {d1[1]}, [%0], %2 \n" - "vld1.16 {d0[2]}, [%0], %2 \n" - "vld1.16 {d1[2]}, [%0], %2 \n" - "vld1.16 {d0[3]}, [%0], %2 \n" - "vld1.16 {d1[3]}, [%0] \n" - - "vtrn.8 d0, d1 \n" - - "mov %0, %3 \n" - - "vst1.64 {d0}, [%0], %4 \n" - "vst1.64 {d1}, [%0] \n" - - "add %1, #2 \n" // src += 2 - "add %3, %3, %4, lsl #1 \n" // dst += 2 * dst_stride - "subs %5, #2 \n" // w -= 2 - "beq 4f \n" - - // 1x8 block - "3: \n" - "vld1.8 {d0[0]}, [%1], %2 \n" - "vld1.8 {d0[1]}, [%1], %2 \n" - "vld1.8 {d0[2]}, [%1], %2 \n" - "vld1.8 {d0[3]}, [%1], %2 \n" - "vld1.8 {d0[4]}, [%1], %2 \n" - "vld1.8 {d0[5]}, [%1], %2 \n" - "vld1.8 {d0[6]}, [%1], %2 \n" - "vld1.8 {d0[7]}, [%1] \n" - - "vst1.64 {d0}, [%3] \n" - - "4: \n" - - : "=&r"(src_temp), // %0 - "+r"(src), // %1 - "+r"(src_stride), // %2 - "+r"(dst), // %3 - "+r"(dst_stride), // %4 - "+r"(width) // %5 - : "r"(&kVTbl4x4Transpose) // %6 - : "memory", "cc", "q0", "q1", "q2", "q3"); -} - -static const uvec8 kVTbl4x4TransposeDi = {0, 8, 1, 9, 2, 10, 3, 11, - 4, 12, 5, 13, 6, 14, 7, 15}; - -void TransposeUVWx8_NEON(const uint8_t* src, - int src_stride, - uint8_t* dst_a, - int dst_stride_a, - uint8_t* dst_b, - int dst_stride_b, - int width) { - const uint8_t* src_temp; - asm volatile( - // loops are on blocks of 8. loop will stop when - // counter gets to or below 0. starting the counter - // at w-8 allow for this - "sub %7, #8 \n" - - // handle 8x8 blocks. 
this should be the majority of the plane - "1: \n" - "mov %0, %1 \n" - - "vld2.8 {d0, d1}, [%0], %2 \n" - "vld2.8 {d2, d3}, [%0], %2 \n" - "vld2.8 {d4, d5}, [%0], %2 \n" - "vld2.8 {d6, d7}, [%0], %2 \n" - "vld2.8 {d16, d17}, [%0], %2 \n" - "vld2.8 {d18, d19}, [%0], %2 \n" - "vld2.8 {d20, d21}, [%0], %2 \n" - "vld2.8 {d22, d23}, [%0] \n" - - "vtrn.8 q1, q0 \n" - "vtrn.8 q3, q2 \n" - "vtrn.8 q9, q8 \n" - "vtrn.8 q11, q10 \n" - - "vtrn.16 q1, q3 \n" - "vtrn.16 q0, q2 \n" - "vtrn.16 q9, q11 \n" - "vtrn.16 q8, q10 \n" - - "vtrn.32 q1, q9 \n" - "vtrn.32 q0, q8 \n" - "vtrn.32 q3, q11 \n" - "vtrn.32 q2, q10 \n" - - "vrev16.8 q0, q0 \n" - "vrev16.8 q1, q1 \n" - "vrev16.8 q2, q2 \n" - "vrev16.8 q3, q3 \n" - "vrev16.8 q8, q8 \n" - "vrev16.8 q9, q9 \n" - "vrev16.8 q10, q10 \n" - "vrev16.8 q11, q11 \n" - - "mov %0, %3 \n" - - "vst1.8 {d2}, [%0], %4 \n" - "vst1.8 {d0}, [%0], %4 \n" - "vst1.8 {d6}, [%0], %4 \n" - "vst1.8 {d4}, [%0], %4 \n" - "vst1.8 {d18}, [%0], %4 \n" - "vst1.8 {d16}, [%0], %4 \n" - "vst1.8 {d22}, [%0], %4 \n" - "vst1.8 {d20}, [%0] \n" - - "mov %0, %5 \n" - - "vst1.8 {d3}, [%0], %6 \n" - "vst1.8 {d1}, [%0], %6 \n" - "vst1.8 {d7}, [%0], %6 \n" - "vst1.8 {d5}, [%0], %6 \n" - "vst1.8 {d19}, [%0], %6 \n" - "vst1.8 {d17}, [%0], %6 \n" - "vst1.8 {d23}, [%0], %6 \n" - "vst1.8 {d21}, [%0] \n" - - "add %1, #8*2 \n" // src += 8*2 - "add %3, %3, %4, lsl #3 \n" // dst_a += 8 * - // dst_stride_a - "add %5, %5, %6, lsl #3 \n" // dst_b += 8 * - // dst_stride_b - "subs %7, #8 \n" // w -= 8 - "bge 1b \n" - - // add 8 back to counter. if the result is 0 there are - // no residuals. - "adds %7, #8 \n" - "beq 4f \n" - - // some residual, so between 1 and 7 lines left to transpose - "cmp %7, #2 \n" - "blt 3f \n" - - "cmp %7, #4 \n" - "blt 2f \n" - - // TODO(frkoenig): Clean this up - // 4x8 block - "mov %0, %1 \n" - "vld1.64 {d0}, [%0], %2 \n" - "vld1.64 {d1}, [%0], %2 \n" - "vld1.64 {d2}, [%0], %2 \n" - "vld1.64 {d3}, [%0], %2 \n" - "vld1.64 {d4}, [%0], %2 \n" - "vld1.64 {d5}, [%0], %2 \n" - "vld1.64 {d6}, [%0], %2 \n" - "vld1.64 {d7}, [%0] \n" - - "vld1.8 {q15}, [%8] \n" - - "vtrn.8 q0, q1 \n" - "vtrn.8 q2, q3 \n" - - "vtbl.8 d16, {d0, d1}, d30 \n" - "vtbl.8 d17, {d0, d1}, d31 \n" - "vtbl.8 d18, {d2, d3}, d30 \n" - "vtbl.8 d19, {d2, d3}, d31 \n" - "vtbl.8 d20, {d4, d5}, d30 \n" - "vtbl.8 d21, {d4, d5}, d31 \n" - "vtbl.8 d22, {d6, d7}, d30 \n" - "vtbl.8 d23, {d6, d7}, d31 \n" - - "mov %0, %3 \n" - - "vst1.32 {d16[0]}, [%0], %4 \n" - "vst1.32 {d16[1]}, [%0], %4 \n" - "vst1.32 {d17[0]}, [%0], %4 \n" - "vst1.32 {d17[1]}, [%0], %4 \n" - - "add %0, %3, #4 \n" - "vst1.32 {d20[0]}, [%0], %4 \n" - "vst1.32 {d20[1]}, [%0], %4 \n" - "vst1.32 {d21[0]}, [%0], %4 \n" - "vst1.32 {d21[1]}, [%0] \n" - - "mov %0, %5 \n" - - "vst1.32 {d18[0]}, [%0], %6 \n" - "vst1.32 {d18[1]}, [%0], %6 \n" - "vst1.32 {d19[0]}, [%0], %6 \n" - "vst1.32 {d19[1]}, [%0], %6 \n" - - "add %0, %5, #4 \n" - "vst1.32 {d22[0]}, [%0], %6 \n" - "vst1.32 {d22[1]}, [%0], %6 \n" - "vst1.32 {d23[0]}, [%0], %6 \n" - "vst1.32 {d23[1]}, [%0] \n" - - "add %1, #4*2 \n" // src += 4 * 2 - "add %3, %3, %4, lsl #2 \n" // dst_a += 4 * - // dst_stride_a - "add %5, %5, %6, lsl #2 \n" // dst_b += 4 * - // dst_stride_b - "subs %7, #4 \n" // w -= 4 - "beq 4f \n" - - // some residual, check to see if it includes a 2x8 block, - // or less - "cmp %7, #2 \n" - "blt 3f \n" - - // 2x8 block - "2: \n" - "mov %0, %1 \n" - "vld2.16 {d0[0], d2[0]}, [%0], %2 \n" - "vld2.16 {d1[0], d3[0]}, [%0], %2 \n" - "vld2.16 {d0[1], d2[1]}, [%0], %2 \n" - "vld2.16 {d1[1], d3[1]}, [%0], %2 \n" - 
"vld2.16 {d0[2], d2[2]}, [%0], %2 \n" - "vld2.16 {d1[2], d3[2]}, [%0], %2 \n" - "vld2.16 {d0[3], d2[3]}, [%0], %2 \n" - "vld2.16 {d1[3], d3[3]}, [%0] \n" - - "vtrn.8 d0, d1 \n" - "vtrn.8 d2, d3 \n" - - "mov %0, %3 \n" - - "vst1.64 {d0}, [%0], %4 \n" - "vst1.64 {d2}, [%0] \n" - - "mov %0, %5 \n" - - "vst1.64 {d1}, [%0], %6 \n" - "vst1.64 {d3}, [%0] \n" - - "add %1, #2*2 \n" // src += 2 * 2 - "add %3, %3, %4, lsl #1 \n" // dst_a += 2 * - // dst_stride_a - "add %5, %5, %6, lsl #1 \n" // dst_b += 2 * - // dst_stride_b - "subs %7, #2 \n" // w -= 2 - "beq 4f \n" - - // 1x8 block - "3: \n" - "vld2.8 {d0[0], d1[0]}, [%1], %2 \n" - "vld2.8 {d0[1], d1[1]}, [%1], %2 \n" - "vld2.8 {d0[2], d1[2]}, [%1], %2 \n" - "vld2.8 {d0[3], d1[3]}, [%1], %2 \n" - "vld2.8 {d0[4], d1[4]}, [%1], %2 \n" - "vld2.8 {d0[5], d1[5]}, [%1], %2 \n" - "vld2.8 {d0[6], d1[6]}, [%1], %2 \n" - "vld2.8 {d0[7], d1[7]}, [%1] \n" - - "vst1.64 {d0}, [%3] \n" - "vst1.64 {d1}, [%5] \n" - - "4: \n" - - : "=&r"(src_temp), // %0 - "+r"(src), // %1 - "+r"(src_stride), // %2 - "+r"(dst_a), // %3 - "+r"(dst_stride_a), // %4 - "+r"(dst_b), // %5 - "+r"(dst_stride_b), // %6 - "+r"(width) // %7 - : "r"(&kVTbl4x4TransposeDi) // %8 - : "memory", "cc", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11"); -} -#endif // defined(__ARM_NEON__) && !defined(__aarch64__) - -#ifdef __cplusplus -} // extern "C" -} // namespace libyuv -#endif diff --git a/thirdparty/libyuv/source/rotate_neon64.cc b/thirdparty/libyuv/source/rotate_neon64.cc deleted file mode 100644 index 43c1581..0000000 --- a/thirdparty/libyuv/source/rotate_neon64.cc +++ /dev/null @@ -1,443 +0,0 @@ -/* - * Copyright 2014 The LibYuv Project Authors. All rights reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#include "libyuv/rotate_row.h" -#include "libyuv/row.h" - -#include "libyuv/basic_types.h" - -#ifdef __cplusplus -namespace libyuv { -extern "C" { -#endif - -// This module is for GCC Neon armv8 64 bit. -#if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__) - -static const uvec8 kVTbl4x4Transpose = {0, 4, 8, 12, 1, 5, 9, 13, - 2, 6, 10, 14, 3, 7, 11, 15}; - -void TransposeWx8_NEON(const uint8_t* src, - int src_stride, - uint8_t* dst, - int dst_stride, - int width) { - const uint8_t* src_temp; - asm volatile( - // loops are on blocks of 8. loop will stop when - // counter gets to or below 0. starting the counter - // at w-8 allow for this - "sub %w3, %w3, #8 \n" - - // handle 8x8 blocks. 
this should be the majority of the plane - "1: \n" - "mov %0, %1 \n" - - "ld1 {v0.8b}, [%0], %5 \n" - "ld1 {v1.8b}, [%0], %5 \n" - "ld1 {v2.8b}, [%0], %5 \n" - "ld1 {v3.8b}, [%0], %5 \n" - "ld1 {v4.8b}, [%0], %5 \n" - "ld1 {v5.8b}, [%0], %5 \n" - "ld1 {v6.8b}, [%0], %5 \n" - "ld1 {v7.8b}, [%0] \n" - "mov %0, %1 \n" - - "trn2 v16.8b, v0.8b, v1.8b \n" - "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead - "trn1 v17.8b, v0.8b, v1.8b \n" - "add %0, %0, %5 \n" - "trn2 v18.8b, v2.8b, v3.8b \n" - "prfm pldl1keep, [%0, 448] \n" // row 1 - "trn1 v19.8b, v2.8b, v3.8b \n" - "add %0, %0, %5 \n" - "trn2 v20.8b, v4.8b, v5.8b \n" - "prfm pldl1keep, [%0, 448] \n" // row 2 - "trn1 v21.8b, v4.8b, v5.8b \n" - "add %0, %0, %5 \n" - "trn2 v22.8b, v6.8b, v7.8b \n" - "prfm pldl1keep, [%0, 448] \n" // row 3 - "trn1 v23.8b, v6.8b, v7.8b \n" - "add %0, %0, %5 \n" - - "trn2 v3.4h, v17.4h, v19.4h \n" - "prfm pldl1keep, [%0, 448] \n" // row 4 - "trn1 v1.4h, v17.4h, v19.4h \n" - "add %0, %0, %5 \n" - "trn2 v2.4h, v16.4h, v18.4h \n" - "prfm pldl1keep, [%0, 448] \n" // row 5 - "trn1 v0.4h, v16.4h, v18.4h \n" - "add %0, %0, %5 \n" - "trn2 v7.4h, v21.4h, v23.4h \n" - "prfm pldl1keep, [%0, 448] \n" // row 6 - "trn1 v5.4h, v21.4h, v23.4h \n" - "add %0, %0, %5 \n" - "trn2 v6.4h, v20.4h, v22.4h \n" - "prfm pldl1keep, [%0, 448] \n" // row 7 - "trn1 v4.4h, v20.4h, v22.4h \n" - - "trn2 v21.2s, v1.2s, v5.2s \n" - "trn1 v17.2s, v1.2s, v5.2s \n" - "trn2 v20.2s, v0.2s, v4.2s \n" - "trn1 v16.2s, v0.2s, v4.2s \n" - "trn2 v23.2s, v3.2s, v7.2s \n" - "trn1 v19.2s, v3.2s, v7.2s \n" - "trn2 v22.2s, v2.2s, v6.2s \n" - "trn1 v18.2s, v2.2s, v6.2s \n" - - "mov %0, %2 \n" - - "st1 {v17.8b}, [%0], %6 \n" - "st1 {v16.8b}, [%0], %6 \n" - "st1 {v19.8b}, [%0], %6 \n" - "st1 {v18.8b}, [%0], %6 \n" - "st1 {v21.8b}, [%0], %6 \n" - "st1 {v20.8b}, [%0], %6 \n" - "st1 {v23.8b}, [%0], %6 \n" - "st1 {v22.8b}, [%0] \n" - - "add %1, %1, #8 \n" // src += 8 - "add %2, %2, %6, lsl #3 \n" // dst += 8 * dst_stride - "subs %w3, %w3, #8 \n" // w -= 8 - "b.ge 1b \n" - - // add 8 back to counter. if the result is 0 there are - // no residuals. - "adds %w3, %w3, #8 \n" - "b.eq 4f \n" - - // some residual, so between 1 and 7 lines left to transpose - "cmp %w3, #2 \n" - "b.lt 3f \n" - - "cmp %w3, #4 \n" - "b.lt 2f \n" - - // 4x8 block - "mov %0, %1 \n" - "ld1 {v0.s}[0], [%0], %5 \n" - "ld1 {v0.s}[1], [%0], %5 \n" - "ld1 {v0.s}[2], [%0], %5 \n" - "ld1 {v0.s}[3], [%0], %5 \n" - "ld1 {v1.s}[0], [%0], %5 \n" - "ld1 {v1.s}[1], [%0], %5 \n" - "ld1 {v1.s}[2], [%0], %5 \n" - "ld1 {v1.s}[3], [%0] \n" - - "mov %0, %2 \n" - - "ld1 {v2.16b}, [%4] \n" - - "tbl v3.16b, {v0.16b}, v2.16b \n" - "tbl v0.16b, {v1.16b}, v2.16b \n" - - // TODO(frkoenig): Rework shuffle above to - // write out with 4 instead of 8 writes. 
- "st1 {v3.s}[0], [%0], %6 \n" - "st1 {v3.s}[1], [%0], %6 \n" - "st1 {v3.s}[2], [%0], %6 \n" - "st1 {v3.s}[3], [%0] \n" - - "add %0, %2, #4 \n" - "st1 {v0.s}[0], [%0], %6 \n" - "st1 {v0.s}[1], [%0], %6 \n" - "st1 {v0.s}[2], [%0], %6 \n" - "st1 {v0.s}[3], [%0] \n" - - "add %1, %1, #4 \n" // src += 4 - "add %2, %2, %6, lsl #2 \n" // dst += 4 * dst_stride - "subs %w3, %w3, #4 \n" // w -= 4 - "b.eq 4f \n" - - // some residual, check to see if it includes a 2x8 block, - // or less - "cmp %w3, #2 \n" - "b.lt 3f \n" - - // 2x8 block - "2: \n" - "mov %0, %1 \n" - "ld1 {v0.h}[0], [%0], %5 \n" - "ld1 {v1.h}[0], [%0], %5 \n" - "ld1 {v0.h}[1], [%0], %5 \n" - "ld1 {v1.h}[1], [%0], %5 \n" - "ld1 {v0.h}[2], [%0], %5 \n" - "ld1 {v1.h}[2], [%0], %5 \n" - "ld1 {v0.h}[3], [%0], %5 \n" - "ld1 {v1.h}[3], [%0] \n" - - "trn2 v2.8b, v0.8b, v1.8b \n" - "trn1 v3.8b, v0.8b, v1.8b \n" - - "mov %0, %2 \n" - - "st1 {v3.8b}, [%0], %6 \n" - "st1 {v2.8b}, [%0] \n" - - "add %1, %1, #2 \n" // src += 2 - "add %2, %2, %6, lsl #1 \n" // dst += 2 * dst_stride - "subs %w3, %w3, #2 \n" // w -= 2 - "b.eq 4f \n" - - // 1x8 block - "3: \n" - "ld1 {v0.b}[0], [%1], %5 \n" - "ld1 {v0.b}[1], [%1], %5 \n" - "ld1 {v0.b}[2], [%1], %5 \n" - "ld1 {v0.b}[3], [%1], %5 \n" - "ld1 {v0.b}[4], [%1], %5 \n" - "ld1 {v0.b}[5], [%1], %5 \n" - "ld1 {v0.b}[6], [%1], %5 \n" - "ld1 {v0.b}[7], [%1] \n" - - "st1 {v0.8b}, [%2] \n" - - "4: \n" - - : "=&r"(src_temp), // %0 - "+r"(src), // %1 - "+r"(dst), // %2 - "+r"(width) // %3 - : "r"(&kVTbl4x4Transpose), // %4 - "r"(static_cast(src_stride)), // %5 - "r"(static_cast(dst_stride)) // %6 - : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", - "v17", "v18", "v19", "v20", "v21", "v22", "v23"); -} - -static const uint8_t kVTbl4x4TransposeDi[32] = { - 0, 16, 32, 48, 2, 18, 34, 50, 4, 20, 36, 52, 6, 22, 38, 54, - 1, 17, 33, 49, 3, 19, 35, 51, 5, 21, 37, 53, 7, 23, 39, 55}; - -void TransposeUVWx8_NEON(const uint8_t* src, - int src_stride, - uint8_t* dst_a, - int dst_stride_a, - uint8_t* dst_b, - int dst_stride_b, - int width) { - const uint8_t* src_temp; - asm volatile( - // loops are on blocks of 8. loop will stop when - // counter gets to or below 0. starting the counter - // at w-8 allow for this - "sub %w4, %w4, #8 \n" - - // handle 8x8 blocks. 
this should be the majority of the plane - "1: \n" - "mov %0, %1 \n" - - "ld1 {v0.16b}, [%0], %5 \n" - "ld1 {v1.16b}, [%0], %5 \n" - "ld1 {v2.16b}, [%0], %5 \n" - "ld1 {v3.16b}, [%0], %5 \n" - "ld1 {v4.16b}, [%0], %5 \n" - "ld1 {v5.16b}, [%0], %5 \n" - "ld1 {v6.16b}, [%0], %5 \n" - "ld1 {v7.16b}, [%0] \n" - "mov %0, %1 \n" - - "trn1 v16.16b, v0.16b, v1.16b \n" - "trn2 v17.16b, v0.16b, v1.16b \n" - "trn1 v18.16b, v2.16b, v3.16b \n" - "trn2 v19.16b, v2.16b, v3.16b \n" - "trn1 v20.16b, v4.16b, v5.16b \n" - "trn2 v21.16b, v4.16b, v5.16b \n" - "trn1 v22.16b, v6.16b, v7.16b \n" - "trn2 v23.16b, v6.16b, v7.16b \n" - - "trn1 v0.8h, v16.8h, v18.8h \n" - "trn2 v1.8h, v16.8h, v18.8h \n" - "trn1 v2.8h, v20.8h, v22.8h \n" - "trn2 v3.8h, v20.8h, v22.8h \n" - "trn1 v4.8h, v17.8h, v19.8h \n" - "trn2 v5.8h, v17.8h, v19.8h \n" - "trn1 v6.8h, v21.8h, v23.8h \n" - "trn2 v7.8h, v21.8h, v23.8h \n" - - "trn1 v16.4s, v0.4s, v2.4s \n" - "trn2 v17.4s, v0.4s, v2.4s \n" - "trn1 v18.4s, v1.4s, v3.4s \n" - "trn2 v19.4s, v1.4s, v3.4s \n" - "trn1 v20.4s, v4.4s, v6.4s \n" - "trn2 v21.4s, v4.4s, v6.4s \n" - "trn1 v22.4s, v5.4s, v7.4s \n" - "trn2 v23.4s, v5.4s, v7.4s \n" - - "mov %0, %2 \n" - - "st1 {v16.d}[0], [%0], %6 \n" - "st1 {v18.d}[0], [%0], %6 \n" - "st1 {v17.d}[0], [%0], %6 \n" - "st1 {v19.d}[0], [%0], %6 \n" - "st1 {v16.d}[1], [%0], %6 \n" - "st1 {v18.d}[1], [%0], %6 \n" - "st1 {v17.d}[1], [%0], %6 \n" - "st1 {v19.d}[1], [%0] \n" - - "mov %0, %3 \n" - - "st1 {v20.d}[0], [%0], %7 \n" - "st1 {v22.d}[0], [%0], %7 \n" - "st1 {v21.d}[0], [%0], %7 \n" - "st1 {v23.d}[0], [%0], %7 \n" - "st1 {v20.d}[1], [%0], %7 \n" - "st1 {v22.d}[1], [%0], %7 \n" - "st1 {v21.d}[1], [%0], %7 \n" - "st1 {v23.d}[1], [%0] \n" - - "add %1, %1, #16 \n" // src += 8*2 - "add %2, %2, %6, lsl #3 \n" // dst_a += 8 * - // dst_stride_a - "add %3, %3, %7, lsl #3 \n" // dst_b += 8 * - // dst_stride_b - "subs %w4, %w4, #8 \n" // w -= 8 - "b.ge 1b \n" - - // add 8 back to counter. if the result is 0 there are - // no residuals. 
- "adds %w4, %w4, #8 \n" - "b.eq 4f \n" - - // some residual, so between 1 and 7 lines left to transpose - "cmp %w4, #2 \n" - "b.lt 3f \n" - - "cmp %w4, #4 \n" - "b.lt 2f \n" - - // TODO(frkoenig): Clean this up - // 4x8 block - "mov %0, %1 \n" - "ld1 {v0.8b}, [%0], %5 \n" - "ld1 {v1.8b}, [%0], %5 \n" - "ld1 {v2.8b}, [%0], %5 \n" - "ld1 {v3.8b}, [%0], %5 \n" - "ld1 {v4.8b}, [%0], %5 \n" - "ld1 {v5.8b}, [%0], %5 \n" - "ld1 {v6.8b}, [%0], %5 \n" - "ld1 {v7.8b}, [%0] \n" - - "ld1 {v30.16b}, [%8], #16 \n" - "ld1 {v31.16b}, [%8] \n" - - "tbl v16.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v30.16b \n" - "tbl v17.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v31.16b \n" - "tbl v18.16b, {v4.16b, v5.16b, v6.16b, v7.16b}, v30.16b \n" - "tbl v19.16b, {v4.16b, v5.16b, v6.16b, v7.16b}, v31.16b \n" - - "mov %0, %2 \n" - - "st1 {v16.s}[0], [%0], %6 \n" - "st1 {v16.s}[1], [%0], %6 \n" - "st1 {v16.s}[2], [%0], %6 \n" - "st1 {v16.s}[3], [%0], %6 \n" - - "add %0, %2, #4 \n" - "st1 {v18.s}[0], [%0], %6 \n" - "st1 {v18.s}[1], [%0], %6 \n" - "st1 {v18.s}[2], [%0], %6 \n" - "st1 {v18.s}[3], [%0] \n" - - "mov %0, %3 \n" - - "st1 {v17.s}[0], [%0], %7 \n" - "st1 {v17.s}[1], [%0], %7 \n" - "st1 {v17.s}[2], [%0], %7 \n" - "st1 {v17.s}[3], [%0], %7 \n" - - "add %0, %3, #4 \n" - "st1 {v19.s}[0], [%0], %7 \n" - "st1 {v19.s}[1], [%0], %7 \n" - "st1 {v19.s}[2], [%0], %7 \n" - "st1 {v19.s}[3], [%0] \n" - - "add %1, %1, #8 \n" // src += 4 * 2 - "add %2, %2, %6, lsl #2 \n" // dst_a += 4 * - // dst_stride_a - "add %3, %3, %7, lsl #2 \n" // dst_b += 4 * - // dst_stride_b - "subs %w4, %w4, #4 \n" // w -= 4 - "b.eq 4f \n" - - // some residual, check to see if it includes a 2x8 block, - // or less - "cmp %w4, #2 \n" - "b.lt 3f \n" - - // 2x8 block - "2: \n" - "mov %0, %1 \n" - "ld2 {v0.h, v1.h}[0], [%0], %5 \n" - "ld2 {v2.h, v3.h}[0], [%0], %5 \n" - "ld2 {v0.h, v1.h}[1], [%0], %5 \n" - "ld2 {v2.h, v3.h}[1], [%0], %5 \n" - "ld2 {v0.h, v1.h}[2], [%0], %5 \n" - "ld2 {v2.h, v3.h}[2], [%0], %5 \n" - "ld2 {v0.h, v1.h}[3], [%0], %5 \n" - "ld2 {v2.h, v3.h}[3], [%0] \n" - - "trn1 v4.8b, v0.8b, v2.8b \n" - "trn2 v5.8b, v0.8b, v2.8b \n" - "trn1 v6.8b, v1.8b, v3.8b \n" - "trn2 v7.8b, v1.8b, v3.8b \n" - - "mov %0, %2 \n" - - "st1 {v4.d}[0], [%0], %6 \n" - "st1 {v6.d}[0], [%0] \n" - - "mov %0, %3 \n" - - "st1 {v5.d}[0], [%0], %7 \n" - "st1 {v7.d}[0], [%0] \n" - - "add %1, %1, #4 \n" // src += 2 * 2 - "add %2, %2, %6, lsl #1 \n" // dst_a += 2 * - // dst_stride_a - "add %3, %3, %7, lsl #1 \n" // dst_b += 2 * - // dst_stride_b - "subs %w4, %w4, #2 \n" // w -= 2 - "b.eq 4f \n" - - // 1x8 block - "3: \n" - "ld2 {v0.b, v1.b}[0], [%1], %5 \n" - "ld2 {v0.b, v1.b}[1], [%1], %5 \n" - "ld2 {v0.b, v1.b}[2], [%1], %5 \n" - "ld2 {v0.b, v1.b}[3], [%1], %5 \n" - "ld2 {v0.b, v1.b}[4], [%1], %5 \n" - "ld2 {v0.b, v1.b}[5], [%1], %5 \n" - "ld2 {v0.b, v1.b}[6], [%1], %5 \n" - "ld2 {v0.b, v1.b}[7], [%1] \n" - - "st1 {v0.d}[0], [%2] \n" - "st1 {v1.d}[0], [%3] \n" - - "4: \n" - - : "=&r"(src_temp), // %0 - "+r"(src), // %1 - "+r"(dst_a), // %2 - "+r"(dst_b), // %3 - "+r"(width) // %4 - : "r"(static_cast(src_stride)), // %5 - "r"(static_cast(dst_stride_a)), // %6 - "r"(static_cast(dst_stride_b)), // %7 - "r"(&kVTbl4x4TransposeDi) // %8 - : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", - "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v30", "v31"); -} -#endif // !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__) - -#ifdef __cplusplus -} // extern "C" -} // namespace libyuv -#endif diff --git a/thirdparty/libyuv/source/rotate_win.cc 
b/thirdparty/libyuv/source/rotate_win.cc deleted file mode 100644 index a78873f..0000000 --- a/thirdparty/libyuv/source/rotate_win.cc +++ /dev/null @@ -1,253 +0,0 @@ -/* - * Copyright 2013 The LibYuv Project Authors. All rights reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#include "libyuv/rotate_row.h" -#include "libyuv/row.h" - -#ifdef __cplusplus -namespace libyuv { -extern "C" { -#endif - -// This module is for 32 bit Visual C x86 -#if !defined(LIBYUV_DISABLE_X86) && defined(_MSC_VER) && \ - !defined(__clang__) && defined(_M_IX86) - -__declspec(naked) void TransposeWx8_SSSE3(const uint8_t* src, - int src_stride, - uint8_t* dst, - int dst_stride, - int width) { - __asm { - push edi - push esi - push ebp - mov eax, [esp + 12 + 4] // src - mov edi, [esp + 12 + 8] // src_stride - mov edx, [esp + 12 + 12] // dst - mov esi, [esp + 12 + 16] // dst_stride - mov ecx, [esp + 12 + 20] // width - - // Read in the data from the source pointer. - // First round of bit swap. - align 4 - convertloop: - movq xmm0, qword ptr [eax] - lea ebp, [eax + 8] - movq xmm1, qword ptr [eax + edi] - lea eax, [eax + 2 * edi] - punpcklbw xmm0, xmm1 - movq xmm2, qword ptr [eax] - movdqa xmm1, xmm0 - palignr xmm1, xmm1, 8 - movq xmm3, qword ptr [eax + edi] - lea eax, [eax + 2 * edi] - punpcklbw xmm2, xmm3 - movdqa xmm3, xmm2 - movq xmm4, qword ptr [eax] - palignr xmm3, xmm3, 8 - movq xmm5, qword ptr [eax + edi] - punpcklbw xmm4, xmm5 - lea eax, [eax + 2 * edi] - movdqa xmm5, xmm4 - movq xmm6, qword ptr [eax] - palignr xmm5, xmm5, 8 - movq xmm7, qword ptr [eax + edi] - punpcklbw xmm6, xmm7 - mov eax, ebp - movdqa xmm7, xmm6 - palignr xmm7, xmm7, 8 - // Second round of bit swap. - punpcklwd xmm0, xmm2 - punpcklwd xmm1, xmm3 - movdqa xmm2, xmm0 - movdqa xmm3, xmm1 - palignr xmm2, xmm2, 8 - palignr xmm3, xmm3, 8 - punpcklwd xmm4, xmm6 - punpcklwd xmm5, xmm7 - movdqa xmm6, xmm4 - movdqa xmm7, xmm5 - palignr xmm6, xmm6, 8 - palignr xmm7, xmm7, 8 - // Third round of bit swap. - // Write to the destination pointer. 
- punpckldq xmm0, xmm4 - movq qword ptr [edx], xmm0 - movdqa xmm4, xmm0 - palignr xmm4, xmm4, 8 - movq qword ptr [edx + esi], xmm4 - lea edx, [edx + 2 * esi] - punpckldq xmm2, xmm6 - movdqa xmm6, xmm2 - palignr xmm6, xmm6, 8 - movq qword ptr [edx], xmm2 - punpckldq xmm1, xmm5 - movq qword ptr [edx + esi], xmm6 - lea edx, [edx + 2 * esi] - movdqa xmm5, xmm1 - movq qword ptr [edx], xmm1 - palignr xmm5, xmm5, 8 - punpckldq xmm3, xmm7 - movq qword ptr [edx + esi], xmm5 - lea edx, [edx + 2 * esi] - movq qword ptr [edx], xmm3 - movdqa xmm7, xmm3 - palignr xmm7, xmm7, 8 - sub ecx, 8 - movq qword ptr [edx + esi], xmm7 - lea edx, [edx + 2 * esi] - jg convertloop - - pop ebp - pop esi - pop edi - ret - } -} - -__declspec(naked) void TransposeUVWx8_SSE2(const uint8_t* src, - int src_stride, - uint8_t* dst_a, - int dst_stride_a, - uint8_t* dst_b, - int dst_stride_b, - int w) { - __asm { - push ebx - push esi - push edi - push ebp - mov eax, [esp + 16 + 4] // src - mov edi, [esp + 16 + 8] // src_stride - mov edx, [esp + 16 + 12] // dst_a - mov esi, [esp + 16 + 16] // dst_stride_a - mov ebx, [esp + 16 + 20] // dst_b - mov ebp, [esp + 16 + 24] // dst_stride_b - mov ecx, esp - sub esp, 4 + 16 - and esp, ~15 - mov [esp + 16], ecx - mov ecx, [ecx + 16 + 28] // w - - align 4 - // Read in the data from the source pointer. - // First round of bit swap. - convertloop: - movdqu xmm0, [eax] - movdqu xmm1, [eax + edi] - lea eax, [eax + 2 * edi] - movdqa xmm7, xmm0 // use xmm7 as temp register. - punpcklbw xmm0, xmm1 - punpckhbw xmm7, xmm1 - movdqa xmm1, xmm7 - movdqu xmm2, [eax] - movdqu xmm3, [eax + edi] - lea eax, [eax + 2 * edi] - movdqa xmm7, xmm2 - punpcklbw xmm2, xmm3 - punpckhbw xmm7, xmm3 - movdqa xmm3, xmm7 - movdqu xmm4, [eax] - movdqu xmm5, [eax + edi] - lea eax, [eax + 2 * edi] - movdqa xmm7, xmm4 - punpcklbw xmm4, xmm5 - punpckhbw xmm7, xmm5 - movdqa xmm5, xmm7 - movdqu xmm6, [eax] - movdqu xmm7, [eax + edi] - lea eax, [eax + 2 * edi] - movdqu [esp], xmm5 // backup xmm5 - neg edi - movdqa xmm5, xmm6 // use xmm5 as temp register. - punpcklbw xmm6, xmm7 - punpckhbw xmm5, xmm7 - movdqa xmm7, xmm5 - lea eax, [eax + 8 * edi + 16] - neg edi - // Second round of bit swap. - movdqa xmm5, xmm0 - punpcklwd xmm0, xmm2 - punpckhwd xmm5, xmm2 - movdqa xmm2, xmm5 - movdqa xmm5, xmm1 - punpcklwd xmm1, xmm3 - punpckhwd xmm5, xmm3 - movdqa xmm3, xmm5 - movdqa xmm5, xmm4 - punpcklwd xmm4, xmm6 - punpckhwd xmm5, xmm6 - movdqa xmm6, xmm5 - movdqu xmm5, [esp] // restore xmm5 - movdqu [esp], xmm6 // backup xmm6 - movdqa xmm6, xmm5 // use xmm6 as temp register. - punpcklwd xmm5, xmm7 - punpckhwd xmm6, xmm7 - movdqa xmm7, xmm6 - - // Third round of bit swap. - // Write to the destination pointer. - movdqa xmm6, xmm0 - punpckldq xmm0, xmm4 - punpckhdq xmm6, xmm4 - movdqa xmm4, xmm6 - movdqu xmm6, [esp] // restore xmm6 - movlpd qword ptr [edx], xmm0 - movhpd qword ptr [ebx], xmm0 - movlpd qword ptr [edx + esi], xmm4 - lea edx, [edx + 2 * esi] - movhpd qword ptr [ebx + ebp], xmm4 - lea ebx, [ebx + 2 * ebp] - movdqa xmm0, xmm2 // use xmm0 as the temp register. - punpckldq xmm2, xmm6 - movlpd qword ptr [edx], xmm2 - movhpd qword ptr [ebx], xmm2 - punpckhdq xmm0, xmm6 - movlpd qword ptr [edx + esi], xmm0 - lea edx, [edx + 2 * esi] - movhpd qword ptr [ebx + ebp], xmm0 - lea ebx, [ebx + 2 * ebp] - movdqa xmm0, xmm1 // use xmm0 as the temp register. 
- punpckldq xmm1, xmm5 - movlpd qword ptr [edx], xmm1 - movhpd qword ptr [ebx], xmm1 - punpckhdq xmm0, xmm5 - movlpd qword ptr [edx + esi], xmm0 - lea edx, [edx + 2 * esi] - movhpd qword ptr [ebx + ebp], xmm0 - lea ebx, [ebx + 2 * ebp] - movdqa xmm0, xmm3 // use xmm0 as the temp register. - punpckldq xmm3, xmm7 - movlpd qword ptr [edx], xmm3 - movhpd qword ptr [ebx], xmm3 - punpckhdq xmm0, xmm7 - sub ecx, 8 - movlpd qword ptr [edx + esi], xmm0 - lea edx, [edx + 2 * esi] - movhpd qword ptr [ebx + ebp], xmm0 - lea ebx, [ebx + 2 * ebp] - jg convertloop - - mov esp, [esp + 16] - pop ebp - pop edi - pop esi - pop ebx - ret - } -} - -#endif // !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) - -#ifdef __cplusplus -} // extern "C" -} // namespace libyuv -#endif diff --git a/thirdparty/libyuv/source/row_any.cc b/thirdparty/libyuv/source/row_any.cc deleted file mode 100644 index c9a402e..0000000 --- a/thirdparty/libyuv/source/row_any.cc +++ /dev/null @@ -1,2071 +0,0 @@ -/* - * Copyright 2012 The LibYuv Project Authors. All rights reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#include "libyuv/row.h" - -#include // For memset. - -#include "libyuv/basic_types.h" - -#ifdef __cplusplus -namespace libyuv { -extern "C" { -#endif - -// memset for temp is meant to clear the source buffer (not dest) so that -// SIMD that reads full multiple of 16 bytes will not trigger msan errors. -// memset is not needed for production, as the garbage values are processed but -// not used, although there may be edge cases for subsampling. -// The size of the buffer is based on the largest read, which can be inferred -// by the source type (e.g. ARGB) and the mask (last parameter), or by examining -// the source code for how much the source pointers are advanced. - -// Subsampled source needs to be increase by 1 of not even. -#define SS(width, shift) (((width) + (1 << (shift)) - 1) >> (shift)) - -// Any 4 planes to 1 -#define ANY41(NAMEANY, ANY_SIMD, UVSHIFT, DUVSHIFT, BPP, MASK) \ - void NAMEANY(const uint8_t* y_buf, const uint8_t* u_buf, \ - const uint8_t* v_buf, const uint8_t* a_buf, uint8_t* dst_ptr, \ - int width) { \ - SIMD_ALIGNED(uint8_t temp[64 * 5]); \ - memset(temp, 0, 64 * 4); /* for msan */ \ - int r = width & MASK; \ - int n = width & ~MASK; \ - if (n > 0) { \ - ANY_SIMD(y_buf, u_buf, v_buf, a_buf, dst_ptr, n); \ - } \ - memcpy(temp, y_buf + n, r); \ - memcpy(temp + 64, u_buf + (n >> UVSHIFT), SS(r, UVSHIFT)); \ - memcpy(temp + 128, v_buf + (n >> UVSHIFT), SS(r, UVSHIFT)); \ - memcpy(temp + 192, a_buf + n, r); \ - ANY_SIMD(temp, temp + 64, temp + 128, temp + 192, temp + 256, MASK + 1); \ - memcpy(dst_ptr + (n >> DUVSHIFT) * BPP, temp + 256, \ - SS(r, DUVSHIFT) * BPP); \ - } - -#ifdef HAS_MERGEARGBROW_SSE2 -ANY41(MergeARGBRow_Any_SSE2, MergeARGBRow_SSE2, 0, 0, 4, 7) -#endif -#ifdef HAS_MERGEARGBROW_AVX2 -ANY41(MergeARGBRow_Any_AVX2, MergeARGBRow_AVX2, 0, 0, 4, 15) -#endif -#ifdef HAS_MERGEARGBROW_NEON -ANY41(MergeARGBRow_Any_NEON, MergeARGBRow_NEON, 0, 0, 4, 15) -#endif - -// Note that odd width replication includes 444 due to implementation -// on arm that subsamples 444 to 422 internally. 
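All of the ANY* wrappers being removed in this file share one tail-handling idea: run the SIMD kernel over the largest multiple of (MASK + 1) pixels, copy the leftover pixels into a zero-padded scratch buffer, run the kernel once more on that padded tail, and copy back only the real output. As a rough, self-contained sketch of that pattern (the kernel name CopyRow_SIMD, its scalar stand-in body, and the 16-pixel granularity are made up for illustration and are not libyuv's API):

#include <stdint.h>
#include <string.h>

/* Stand-in "kernel": pretend this is a SIMD routine that requires width to
   be a multiple of 16; here it simply copies bytes. */
static void CopyRow_SIMD(const uint8_t* src, uint8_t* dst, int width) {
  memcpy(dst, src, (size_t)width);
}

/* "Any width" wrapper, mirroring the structure the ANY11 macro generates:
   process the aligned bulk, then pad the tail so the kernel can safely read
   a full vector's worth of bytes past the real data. */
void CopyRow_Any_SIMD(const uint8_t* src, uint8_t* dst, int width) {
  uint8_t temp[16 * 2];
  memset(temp, 0, 16);                /* keep the padded reads defined (msan) */
  int r = width & 15;                 /* leftover pixels */
  int n = width & ~15;                /* bulk, a multiple of 16 */
  if (n > 0) {
    CopyRow_SIMD(src, dst, n);        /* full-speed SIMD over the bulk */
  }
  memcpy(temp, src + n, r);           /* copy the tail into the scratch */
  CopyRow_SIMD(temp, temp + 16, 16);  /* one padded vector for the tail */
  memcpy(dst + n, temp + 16, r);      /* write back only the r real pixels */
}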
-// Any 4 planes to 1 with yuvconstants -#define ANY41C(NAMEANY, ANY_SIMD, UVSHIFT, DUVSHIFT, BPP, MASK) \ - void NAMEANY(const uint8_t* y_buf, const uint8_t* u_buf, \ - const uint8_t* v_buf, const uint8_t* a_buf, uint8_t* dst_ptr, \ - const struct YuvConstants* yuvconstants, int width) { \ - SIMD_ALIGNED(uint8_t temp[64 * 5]); \ - memset(temp, 0, 64 * 4); /* for msan */ \ - int r = width & MASK; \ - int n = width & ~MASK; \ - if (n > 0) { \ - ANY_SIMD(y_buf, u_buf, v_buf, a_buf, dst_ptr, yuvconstants, n); \ - } \ - memcpy(temp, y_buf + n, r); \ - memcpy(temp + 64, u_buf + (n >> UVSHIFT), SS(r, UVSHIFT)); \ - memcpy(temp + 128, v_buf + (n >> UVSHIFT), SS(r, UVSHIFT)); \ - memcpy(temp + 192, a_buf + n, r); \ - if (width & 1) { \ - temp[64 + SS(r, UVSHIFT)] = temp[64 + SS(r, UVSHIFT) - 1]; \ - temp[128 + SS(r, UVSHIFT)] = temp[128 + SS(r, UVSHIFT) - 1]; \ - } \ - ANY_SIMD(temp, temp + 64, temp + 128, temp + 192, temp + 256, \ - yuvconstants, MASK + 1); \ - memcpy(dst_ptr + (n >> DUVSHIFT) * BPP, temp + 256, \ - SS(r, DUVSHIFT) * BPP); \ - } - -#ifdef HAS_I444ALPHATOARGBROW_SSSE3 -ANY41C(I444AlphaToARGBRow_Any_SSSE3, I444AlphaToARGBRow_SSSE3, 0, 0, 4, 7) -#endif -#ifdef HAS_I444ALPHATOARGBROW_AVX2 -ANY41C(I444AlphaToARGBRow_Any_AVX2, I444AlphaToARGBRow_AVX2, 0, 0, 4, 15) -#endif -#ifdef HAS_I422ALPHATOARGBROW_SSSE3 -ANY41C(I422AlphaToARGBRow_Any_SSSE3, I422AlphaToARGBRow_SSSE3, 1, 0, 4, 7) -#endif -#ifdef HAS_I422ALPHATOARGBROW_AVX2 -ANY41C(I422AlphaToARGBRow_Any_AVX2, I422AlphaToARGBRow_AVX2, 1, 0, 4, 15) -#endif -#ifdef HAS_I444ALPHATOARGBROW_NEON -ANY41C(I444AlphaToARGBRow_Any_NEON, I444AlphaToARGBRow_NEON, 0, 0, 4, 7) -#endif -#ifdef HAS_I422ALPHATOARGBROW_NEON -ANY41C(I422AlphaToARGBRow_Any_NEON, I422AlphaToARGBRow_NEON, 1, 0, 4, 7) -#endif -#ifdef HAS_I444ALPHATOARGBROW_MSA -ANY41C(I444AlphaToARGBRow_Any_MSA, I444AlphaToARGBRow_MSA, 0, 0, 4, 7) -#endif -#ifdef HAS_I422ALPHATOARGBROW_MSA -ANY41C(I422AlphaToARGBRow_Any_MSA, I422AlphaToARGBRow_MSA, 1, 0, 4, 7) -#endif -#ifdef HAS_I444ALPHATOARGBROW_MMI -ANY41C(I444AlphaToARGBRow_Any_MMI, I444AlphaToARGBRow_MMI, 0, 0, 4, 7) -#endif -#ifdef HAS_I422ALPHATOARGBROW_MMI -ANY41C(I422AlphaToARGBRow_Any_MMI, I422AlphaToARGBRow_MMI, 1, 0, 4, 7) -#endif -#undef ANY41C - -// Any 4 planes to 1 plane of 8 bit with yuvconstants -#define ANY41CT(NAMEANY, ANY_SIMD, UVSHIFT, DUVSHIFT, T, SBPP, BPP, MASK) \ - void NAMEANY(const T* y_buf, const T* u_buf, const T* v_buf, const T* a_buf, \ - uint8_t* dst_ptr, const struct YuvConstants* yuvconstants, \ - int width) { \ - SIMD_ALIGNED(T temp[16 * 4]); \ - SIMD_ALIGNED(uint8_t out[64]); \ - memset(temp, 0, 16 * 4 * SBPP); /* for YUY2 and msan */ \ - int r = width & MASK; \ - int n = width & ~MASK; \ - if (n > 0) { \ - ANY_SIMD(y_buf, u_buf, v_buf, a_buf, dst_ptr, yuvconstants, n); \ - } \ - memcpy(temp, y_buf + n, r * SBPP); \ - memcpy(temp + 16, u_buf + (n >> UVSHIFT), SS(r, UVSHIFT) * SBPP); \ - memcpy(temp + 32, v_buf + (n >> UVSHIFT), SS(r, UVSHIFT) * SBPP); \ - memcpy(temp + 48, a_buf + n, r * SBPP); \ - ANY_SIMD(temp, temp + 16, temp + 32, temp + 48, out, yuvconstants, \ - MASK + 1); \ - memcpy(dst_ptr + (n >> DUVSHIFT) * BPP, out, SS(r, DUVSHIFT) * BPP); \ - } - -#ifdef HAS_I210ALPHATOARGBROW_SSSE3 -ANY41CT(I210AlphaToARGBRow_Any_SSSE3, - I210AlphaToARGBRow_SSSE3, - 1, - 0, - uint16_t, - 2, - 4, - 7) -#endif - -#ifdef HAS_I210ALPHATOARGBROW_AVX2 -ANY41CT(I210AlphaToARGBRow_Any_AVX2, - I210AlphaToARGBRow_AVX2, - 1, - 0, - uint16_t, - 2, - 4, - 15) -#endif - -#ifdef HAS_I410ALPHATOARGBROW_SSSE3 
-ANY41CT(I410AlphaToARGBRow_Any_SSSE3, - I410AlphaToARGBRow_SSSE3, - 0, - 0, - uint16_t, - 2, - 4, - 7) -#endif - -#ifdef HAS_I410ALPHATOARGBROW_AVX2 -ANY41CT(I410AlphaToARGBRow_Any_AVX2, - I410AlphaToARGBRow_AVX2, - 0, - 0, - uint16_t, - 2, - 4, - 15) -#endif - -#undef ANY41CT - -// Any 4 planes to 1 plane with parameter -#define ANY41PT(NAMEANY, ANY_SIMD, STYPE, SBPP, DTYPE, BPP, MASK) \ - void NAMEANY(const STYPE* r_buf, const STYPE* g_buf, const STYPE* b_buf, \ - const STYPE* a_buf, DTYPE* dst_ptr, int depth, int width) { \ - SIMD_ALIGNED(STYPE temp[16 * 4]); \ - SIMD_ALIGNED(DTYPE out[64]); \ - memset(temp, 0, 16 * 4 * SBPP); /* for YUY2 and msan */ \ - int r = width & MASK; \ - int n = width & ~MASK; \ - if (n > 0) { \ - ANY_SIMD(r_buf, g_buf, b_buf, a_buf, dst_ptr, depth, n); \ - } \ - memcpy(temp, r_buf + n, r * SBPP); \ - memcpy(temp + 16, g_buf + n, r * SBPP); \ - memcpy(temp + 32, b_buf + n, r * SBPP); \ - memcpy(temp + 48, a_buf + n, r * SBPP); \ - ANY_SIMD(temp, temp + 16, temp + 32, temp + 48, out, depth, MASK + 1); \ - memcpy((uint8_t*)dst_ptr + n * BPP, out, r * BPP); \ - } - -#ifdef HAS_MERGEAR64ROW_AVX2 -ANY41PT(MergeAR64Row_Any_AVX2, MergeAR64Row_AVX2, uint16_t, 2, uint16_t, 8, 15) -#endif - -#ifdef HAS_MERGEAR64ROW_NEON -ANY41PT(MergeAR64Row_Any_NEON, MergeAR64Row_NEON, uint16_t, 2, uint16_t, 8, 7) -#endif - -#ifdef HAS_MERGEARGB16TO8ROW_AVX2 -ANY41PT(MergeARGB16To8Row_Any_AVX2, - MergeARGB16To8Row_AVX2, - uint16_t, - 2, - uint8_t, - 4, - 15) -#endif - -#ifdef HAS_MERGEARGB16TO8ROW_NEON -ANY41PT(MergeARGB16To8Row_Any_NEON, - MergeARGB16To8Row_NEON, - uint16_t, - 2, - uint8_t, - 4, - 7) -#endif - -#undef ANY41PT - -// Any 3 planes to 1. -#define ANY31(NAMEANY, ANY_SIMD, UVSHIFT, DUVSHIFT, BPP, MASK) \ - void NAMEANY(const uint8_t* y_buf, const uint8_t* u_buf, \ - const uint8_t* v_buf, uint8_t* dst_ptr, int width) { \ - SIMD_ALIGNED(uint8_t temp[64 * 4]); \ - memset(temp, 0, 64 * 3); /* for YUY2 and msan */ \ - int r = width & MASK; \ - int n = width & ~MASK; \ - if (n > 0) { \ - ANY_SIMD(y_buf, u_buf, v_buf, dst_ptr, n); \ - } \ - memcpy(temp, y_buf + n, r); \ - memcpy(temp + 64, u_buf + (n >> UVSHIFT), SS(r, UVSHIFT)); \ - memcpy(temp + 128, v_buf + (n >> UVSHIFT), SS(r, UVSHIFT)); \ - ANY_SIMD(temp, temp + 64, temp + 128, temp + 192, MASK + 1); \ - memcpy(dst_ptr + (n >> DUVSHIFT) * BPP, temp + 192, \ - SS(r, DUVSHIFT) * BPP); \ - } - -// Merge functions. 
-#ifdef HAS_MERGERGBROW_SSSE3 -ANY31(MergeRGBRow_Any_SSSE3, MergeRGBRow_SSSE3, 0, 0, 3, 15) -#endif -#ifdef HAS_MERGERGBROW_NEON -ANY31(MergeRGBRow_Any_NEON, MergeRGBRow_NEON, 0, 0, 3, 15) -#endif -#ifdef HAS_MERGERGBROW_MMI -ANY31(MergeRGBRow_Any_MMI, MergeRGBRow_MMI, 0, 0, 3, 7) -#endif -#ifdef HAS_MERGEXRGBROW_SSE2 -ANY31(MergeXRGBRow_Any_SSE2, MergeXRGBRow_SSE2, 0, 0, 4, 7) -#endif -#ifdef HAS_MERGEXRGBROW_AVX2 -ANY31(MergeXRGBRow_Any_AVX2, MergeXRGBRow_AVX2, 0, 0, 4, 15) -#endif -#ifdef HAS_MERGEXRGBROW_NEON -ANY31(MergeXRGBRow_Any_NEON, MergeXRGBRow_NEON, 0, 0, 4, 15) -#endif -#ifdef HAS_I422TOYUY2ROW_SSE2 -ANY31(I422ToYUY2Row_Any_SSE2, I422ToYUY2Row_SSE2, 1, 1, 4, 15) -ANY31(I422ToUYVYRow_Any_SSE2, I422ToUYVYRow_SSE2, 1, 1, 4, 15) -#endif -#ifdef HAS_I422TOYUY2ROW_AVX2 -ANY31(I422ToYUY2Row_Any_AVX2, I422ToYUY2Row_AVX2, 1, 1, 4, 31) -ANY31(I422ToUYVYRow_Any_AVX2, I422ToUYVYRow_AVX2, 1, 1, 4, 31) -#endif -#ifdef HAS_I422TOYUY2ROW_NEON -ANY31(I422ToYUY2Row_Any_NEON, I422ToYUY2Row_NEON, 1, 1, 4, 15) -#endif -#ifdef HAS_I422TOYUY2ROW_MSA -ANY31(I422ToYUY2Row_Any_MSA, I422ToYUY2Row_MSA, 1, 1, 4, 31) -#endif -#ifdef HAS_I422TOYUY2ROW_MMI -ANY31(I422ToYUY2Row_Any_MMI, I422ToYUY2Row_MMI, 1, 1, 4, 7) -#endif -#ifdef HAS_I422TOUYVYROW_NEON -ANY31(I422ToUYVYRow_Any_NEON, I422ToUYVYRow_NEON, 1, 1, 4, 15) -#endif -#ifdef HAS_I422TOUYVYROW_MSA -ANY31(I422ToUYVYRow_Any_MSA, I422ToUYVYRow_MSA, 1, 1, 4, 31) -#endif -#ifdef HAS_I422TOUYVYROW_MMI -ANY31(I422ToUYVYRow_Any_MMI, I422ToUYVYRow_MMI, 1, 1, 4, 7) -#endif -#ifdef HAS_BLENDPLANEROW_AVX2 -ANY31(BlendPlaneRow_Any_AVX2, BlendPlaneRow_AVX2, 0, 0, 1, 31) -#endif -#ifdef HAS_BLENDPLANEROW_SSSE3 -ANY31(BlendPlaneRow_Any_SSSE3, BlendPlaneRow_SSSE3, 0, 0, 1, 7) -#endif -#ifdef HAS_BLENDPLANEROW_MMI -ANY31(BlendPlaneRow_Any_MMI, BlendPlaneRow_MMI, 0, 0, 1, 7) -#endif -#undef ANY31 - -// Note that odd width replication includes 444 due to implementation -// on arm that subsamples 444 to 422 internally. 
-// Any 3 planes to 1 with yuvconstants -#define ANY31C(NAMEANY, ANY_SIMD, UVSHIFT, DUVSHIFT, BPP, MASK) \ - void NAMEANY(const uint8_t* y_buf, const uint8_t* u_buf, \ - const uint8_t* v_buf, uint8_t* dst_ptr, \ - const struct YuvConstants* yuvconstants, int width) { \ - SIMD_ALIGNED(uint8_t temp[128 * 4]); \ - memset(temp, 0, 128 * 3); /* for YUY2 and msan */ \ - int r = width & MASK; \ - int n = width & ~MASK; \ - if (n > 0) { \ - ANY_SIMD(y_buf, u_buf, v_buf, dst_ptr, yuvconstants, n); \ - } \ - memcpy(temp, y_buf + n, r); \ - memcpy(temp + 128, u_buf + (n >> UVSHIFT), SS(r, UVSHIFT)); \ - memcpy(temp + 256, v_buf + (n >> UVSHIFT), SS(r, UVSHIFT)); \ - if (width & 1) { \ - temp[128 + SS(r, UVSHIFT)] = temp[128 + SS(r, UVSHIFT) - 1]; \ - temp[256 + SS(r, UVSHIFT)] = temp[256 + SS(r, UVSHIFT) - 1]; \ - } \ - ANY_SIMD(temp, temp + 128, temp + 256, temp + 384, yuvconstants, \ - MASK + 1); \ - memcpy(dst_ptr + (n >> DUVSHIFT) * BPP, temp + 384, \ - SS(r, DUVSHIFT) * BPP); \ - } - -#ifdef HAS_I422TOARGBROW_SSSE3 -ANY31C(I422ToARGBRow_Any_SSSE3, I422ToARGBRow_SSSE3, 1, 0, 4, 7) -#endif -#ifdef HAS_I422TORGBAROW_SSSE3 -ANY31C(I422ToRGBARow_Any_SSSE3, I422ToRGBARow_SSSE3, 1, 0, 4, 7) -#endif -#ifdef HAS_I422TOARGB4444ROW_SSSE3 -ANY31C(I422ToARGB4444Row_Any_SSSE3, I422ToARGB4444Row_SSSE3, 1, 0, 2, 7) -#endif -#ifdef HAS_I422TOARGB1555ROW_SSSE3 -ANY31C(I422ToARGB1555Row_Any_SSSE3, I422ToARGB1555Row_SSSE3, 1, 0, 2, 7) -#endif -#ifdef HAS_I422TORGB565ROW_SSSE3 -ANY31C(I422ToRGB565Row_Any_SSSE3, I422ToRGB565Row_SSSE3, 1, 0, 2, 7) -#endif -#ifdef HAS_I422TORGB24ROW_SSSE3 -ANY31C(I422ToRGB24Row_Any_SSSE3, I422ToRGB24Row_SSSE3, 1, 0, 3, 15) -#endif -#ifdef HAS_I422TOAR30ROW_SSSE3 -ANY31C(I422ToAR30Row_Any_SSSE3, I422ToAR30Row_SSSE3, 1, 0, 4, 7) -#endif -#ifdef HAS_I422TOAR30ROW_AVX2 -ANY31C(I422ToAR30Row_Any_AVX2, I422ToAR30Row_AVX2, 1, 0, 4, 15) -#endif -#ifdef HAS_I444TOARGBROW_SSSE3 -ANY31C(I444ToARGBRow_Any_SSSE3, I444ToARGBRow_SSSE3, 0, 0, 4, 7) -#endif -#ifdef HAS_I422TORGB24ROW_AVX2 -ANY31C(I422ToRGB24Row_Any_AVX2, I422ToRGB24Row_AVX2, 1, 0, 3, 31) -#endif -#ifdef HAS_I422TOARGBROW_AVX2 -ANY31C(I422ToARGBRow_Any_AVX2, I422ToARGBRow_AVX2, 1, 0, 4, 15) -#endif -#ifdef HAS_I422TORGBAROW_AVX2 -ANY31C(I422ToRGBARow_Any_AVX2, I422ToRGBARow_AVX2, 1, 0, 4, 15) -#endif -#ifdef HAS_I444TOARGBROW_AVX2 -ANY31C(I444ToARGBRow_Any_AVX2, I444ToARGBRow_AVX2, 0, 0, 4, 15) -#endif -#ifdef HAS_I422TOARGB4444ROW_AVX2 -ANY31C(I422ToARGB4444Row_Any_AVX2, I422ToARGB4444Row_AVX2, 1, 0, 2, 15) -#endif -#ifdef HAS_I422TOARGB1555ROW_AVX2 -ANY31C(I422ToARGB1555Row_Any_AVX2, I422ToARGB1555Row_AVX2, 1, 0, 2, 15) -#endif -#ifdef HAS_I422TORGB565ROW_AVX2 -ANY31C(I422ToRGB565Row_Any_AVX2, I422ToRGB565Row_AVX2, 1, 0, 2, 15) -#endif -#ifdef HAS_I422TOARGBROW_NEON -ANY31C(I444ToARGBRow_Any_NEON, I444ToARGBRow_NEON, 0, 0, 4, 7) -ANY31C(I422ToARGBRow_Any_NEON, I422ToARGBRow_NEON, 1, 0, 4, 7) -ANY31C(I422ToRGBARow_Any_NEON, I422ToRGBARow_NEON, 1, 0, 4, 7) -ANY31C(I422ToRGB24Row_Any_NEON, I422ToRGB24Row_NEON, 1, 0, 3, 7) -ANY31C(I422ToARGB4444Row_Any_NEON, I422ToARGB4444Row_NEON, 1, 0, 2, 7) -ANY31C(I422ToARGB1555Row_Any_NEON, I422ToARGB1555Row_NEON, 1, 0, 2, 7) -ANY31C(I422ToRGB565Row_Any_NEON, I422ToRGB565Row_NEON, 1, 0, 2, 7) -#endif -#ifdef HAS_I422TOARGBROW_MSA -ANY31C(I444ToARGBRow_Any_MSA, I444ToARGBRow_MSA, 0, 0, 4, 7) -ANY31C(I422ToARGBRow_Any_MSA, I422ToARGBRow_MSA, 1, 0, 4, 7) -ANY31C(I422ToRGBARow_Any_MSA, I422ToRGBARow_MSA, 1, 0, 4, 7) -ANY31C(I422ToRGB24Row_Any_MSA, I422ToRGB24Row_MSA, 1, 0, 3, 15) 
-ANY31C(I422ToARGB4444Row_Any_MSA, I422ToARGB4444Row_MSA, 1, 0, 2, 7) -ANY31C(I422ToARGB1555Row_Any_MSA, I422ToARGB1555Row_MSA, 1, 0, 2, 7) -ANY31C(I422ToRGB565Row_Any_MSA, I422ToRGB565Row_MSA, 1, 0, 2, 7) -#endif -#ifdef HAS_I422TOARGBROW_MMI -ANY31C(I444ToARGBRow_Any_MMI, I444ToARGBRow_MMI, 0, 0, 4, 7) -ANY31C(I422ToARGBRow_Any_MMI, I422ToARGBRow_MMI, 1, 0, 4, 7) -ANY31C(I422ToRGB24Row_Any_MMI, I422ToRGB24Row_MMI, 1, 0, 3, 15) -ANY31C(I422ToARGB4444Row_Any_MMI, I422ToARGB4444Row_MMI, 1, 0, 2, 7) -ANY31C(I422ToARGB1555Row_Any_MMI, I422ToARGB1555Row_MMI, 1, 0, 2, 7) -ANY31C(I422ToRGB565Row_Any_MMI, I422ToRGB565Row_MMI, 1, 0, 2, 7) -ANY31C(I422ToRGBARow_Any_MMI, I422ToRGBARow_MMI, 1, 0, 4, 7) -#endif -#undef ANY31C - -// Any 3 planes of 16 bit to 1 with yuvconstants -// TODO(fbarchard): consider sharing this code with ANY31C -#define ANY31CT(NAMEANY, ANY_SIMD, UVSHIFT, DUVSHIFT, T, SBPP, BPP, MASK) \ - void NAMEANY(const T* y_buf, const T* u_buf, const T* v_buf, \ - uint8_t* dst_ptr, const struct YuvConstants* yuvconstants, \ - int width) { \ - SIMD_ALIGNED(T temp[16 * 3]); \ - SIMD_ALIGNED(uint8_t out[64]); \ - memset(temp, 0, 16 * 3 * SBPP); /* for YUY2 and msan */ \ - int r = width & MASK; \ - int n = width & ~MASK; \ - if (n > 0) { \ - ANY_SIMD(y_buf, u_buf, v_buf, dst_ptr, yuvconstants, n); \ - } \ - memcpy(temp, y_buf + n, r * SBPP); \ - memcpy(temp + 16, u_buf + (n >> UVSHIFT), SS(r, UVSHIFT) * SBPP); \ - memcpy(temp + 32, v_buf + (n >> UVSHIFT), SS(r, UVSHIFT) * SBPP); \ - ANY_SIMD(temp, temp + 16, temp + 32, out, yuvconstants, MASK + 1); \ - memcpy(dst_ptr + (n >> DUVSHIFT) * BPP, out, SS(r, DUVSHIFT) * BPP); \ - } - -#ifdef HAS_I210TOAR30ROW_SSSE3 -ANY31CT(I210ToAR30Row_Any_SSSE3, I210ToAR30Row_SSSE3, 1, 0, uint16_t, 2, 4, 7) -#endif -#ifdef HAS_I210TOARGBROW_SSSE3 -ANY31CT(I210ToARGBRow_Any_SSSE3, I210ToARGBRow_SSSE3, 1, 0, uint16_t, 2, 4, 7) -#endif -#ifdef HAS_I210TOARGBROW_AVX2 -ANY31CT(I210ToARGBRow_Any_AVX2, I210ToARGBRow_AVX2, 1, 0, uint16_t, 2, 4, 15) -#endif -#ifdef HAS_I210TOAR30ROW_AVX2 -ANY31CT(I210ToAR30Row_Any_AVX2, I210ToAR30Row_AVX2, 1, 0, uint16_t, 2, 4, 15) -#endif -#ifdef HAS_I410TOAR30ROW_SSSE3 -ANY31CT(I410ToAR30Row_Any_SSSE3, I410ToAR30Row_SSSE3, 0, 0, uint16_t, 2, 4, 7) -#endif -#ifdef HAS_I410TOARGBROW_SSSE3 -ANY31CT(I410ToARGBRow_Any_SSSE3, I410ToARGBRow_SSSE3, 0, 0, uint16_t, 2, 4, 7) -#endif -#ifdef HAS_I410TOARGBROW_AVX2 -ANY31CT(I410ToARGBRow_Any_AVX2, I410ToARGBRow_AVX2, 0, 0, uint16_t, 2, 4, 15) -#endif -#ifdef HAS_I410TOAR30ROW_AVX2 -ANY31CT(I410ToAR30Row_Any_AVX2, I410ToAR30Row_AVX2, 0, 0, uint16_t, 2, 4, 15) -#endif -#ifdef HAS_I210TOARGBROW_MMI -ANY31CT(I210ToARGBRow_Any_MMI, I210ToARGBRow_MMI, 1, 0, uint16_t, 2, 4, 7) -#endif -#ifdef HAS_I212TOAR30ROW_SSSE3 -ANY31CT(I212ToAR30Row_Any_SSSE3, I212ToAR30Row_SSSE3, 1, 0, uint16_t, 2, 4, 7) -#endif -#ifdef HAS_I212TOARGBROW_SSSE3 -ANY31CT(I212ToARGBRow_Any_SSSE3, I212ToARGBRow_SSSE3, 1, 0, uint16_t, 2, 4, 7) -#endif -#ifdef HAS_I212TOARGBROW_AVX2 -ANY31CT(I212ToARGBRow_Any_AVX2, I212ToARGBRow_AVX2, 1, 0, uint16_t, 2, 4, 15) -#endif -#ifdef HAS_I212TOAR30ROW_AVX2 -ANY31CT(I212ToAR30Row_Any_AVX2, I212ToAR30Row_AVX2, 1, 0, uint16_t, 2, 4, 15) -#endif -#undef ANY31CT - -// Any 3 planes to 1 plane with parameter -#define ANY31PT(NAMEANY, ANY_SIMD, STYPE, SBPP, DTYPE, BPP, MASK) \ - void NAMEANY(const STYPE* r_buf, const STYPE* g_buf, const STYPE* b_buf, \ - DTYPE* dst_ptr, int depth, int width) { \ - SIMD_ALIGNED(STYPE temp[16 * 3]); \ - SIMD_ALIGNED(DTYPE out[64]); \ - memset(temp, 0, 16 * 3 * SBPP); /* 
for YUY2 and msan */ \ - int r = width & MASK; \ - int n = width & ~MASK; \ - if (n > 0) { \ - ANY_SIMD(r_buf, g_buf, b_buf, dst_ptr, depth, n); \ - } \ - memcpy(temp, r_buf + n, r * SBPP); \ - memcpy(temp + 16, g_buf + n, r * SBPP); \ - memcpy(temp + 32, b_buf + n, r * SBPP); \ - ANY_SIMD(temp, temp + 16, temp + 32, out, depth, MASK + 1); \ - memcpy((uint8_t*)dst_ptr + n * BPP, out, r * BPP); \ - } - -#ifdef HAS_MERGEXR30ROW_AVX2 -ANY31PT(MergeXR30Row_Any_AVX2, MergeXR30Row_AVX2, uint16_t, 2, uint8_t, 4, 15) -#endif - -#ifdef HAS_MERGEXR30ROW_NEON -ANY31PT(MergeXR30Row_Any_NEON, MergeXR30Row_NEON, uint16_t, 2, uint8_t, 4, 3) -ANY31PT(MergeXR30Row_10_Any_NEON, - MergeXR30Row_10_NEON, - uint16_t, - 2, - uint8_t, - 4, - 3) -#endif - -#ifdef HAS_MERGEXR64ROW_AVX2 -ANY31PT(MergeXR64Row_Any_AVX2, MergeXR64Row_AVX2, uint16_t, 2, uint16_t, 8, 15) -#endif - -#ifdef HAS_MERGEXR64ROW_NEON -ANY31PT(MergeXR64Row_Any_NEON, MergeXR64Row_NEON, uint16_t, 2, uint16_t, 8, 7) -#endif - -#ifdef HAS_MERGEXRGB16TO8ROW_AVX2 -ANY31PT(MergeXRGB16To8Row_Any_AVX2, - MergeXRGB16To8Row_AVX2, - uint16_t, - 2, - uint8_t, - 4, - 15) -#endif - -#ifdef HAS_MERGEXRGB16TO8ROW_NEON -ANY31PT(MergeXRGB16To8Row_Any_NEON, - MergeXRGB16To8Row_NEON, - uint16_t, - 2, - uint8_t, - 4, - 7) -#endif - -#undef ANY31PT - -// Any 2 planes to 1. -#define ANY21(NAMEANY, ANY_SIMD, UVSHIFT, SBPP, SBPP2, BPP, MASK) \ - void NAMEANY(const uint8_t* y_buf, const uint8_t* uv_buf, uint8_t* dst_ptr, \ - int width) { \ - SIMD_ALIGNED(uint8_t temp[64 * 3]); \ - memset(temp, 0, 64 * 2); /* for msan */ \ - int r = width & MASK; \ - int n = width & ~MASK; \ - if (n > 0) { \ - ANY_SIMD(y_buf, uv_buf, dst_ptr, n); \ - } \ - memcpy(temp, y_buf + n * SBPP, r * SBPP); \ - memcpy(temp + 64, uv_buf + (n >> UVSHIFT) * SBPP2, \ - SS(r, UVSHIFT) * SBPP2); \ - ANY_SIMD(temp, temp + 64, temp + 128, MASK + 1); \ - memcpy(dst_ptr + n * BPP, temp + 128, r * BPP); \ - } - -// Merge functions. -#ifdef HAS_MERGEUVROW_SSE2 -ANY21(MergeUVRow_Any_SSE2, MergeUVRow_SSE2, 0, 1, 1, 2, 15) -#endif -#ifdef HAS_MERGEUVROW_AVX2 -ANY21(MergeUVRow_Any_AVX2, MergeUVRow_AVX2, 0, 1, 1, 2, 31) -#endif -#ifdef HAS_MERGEUVROW_NEON -ANY21(MergeUVRow_Any_NEON, MergeUVRow_NEON, 0, 1, 1, 2, 15) -#endif -#ifdef HAS_MERGEUVROW_MSA -ANY21(MergeUVRow_Any_MSA, MergeUVRow_MSA, 0, 1, 1, 2, 15) -#endif -#ifdef HAS_MERGEUVROW_MMI -ANY21(MergeUVRow_Any_MMI, MergeUVRow_MMI, 0, 1, 1, 2, 7) -#endif -#ifdef HAS_NV21TOYUV24ROW_NEON -ANY21(NV21ToYUV24Row_Any_NEON, NV21ToYUV24Row_NEON, 1, 1, 2, 3, 15) -#endif -#ifdef HAS_NV21TOYUV24ROW_AVX2 -ANY21(NV21ToYUV24Row_Any_AVX2, NV21ToYUV24Row_AVX2, 1, 1, 2, 3, 31) -#endif -// Math functions. 
-#ifdef HAS_ARGBMULTIPLYROW_SSE2 -ANY21(ARGBMultiplyRow_Any_SSE2, ARGBMultiplyRow_SSE2, 0, 4, 4, 4, 3) -#endif -#ifdef HAS_ARGBADDROW_SSE2 -ANY21(ARGBAddRow_Any_SSE2, ARGBAddRow_SSE2, 0, 4, 4, 4, 3) -#endif -#ifdef HAS_ARGBSUBTRACTROW_SSE2 -ANY21(ARGBSubtractRow_Any_SSE2, ARGBSubtractRow_SSE2, 0, 4, 4, 4, 3) -#endif -#ifdef HAS_ARGBMULTIPLYROW_AVX2 -ANY21(ARGBMultiplyRow_Any_AVX2, ARGBMultiplyRow_AVX2, 0, 4, 4, 4, 7) -#endif -#ifdef HAS_ARGBADDROW_AVX2 -ANY21(ARGBAddRow_Any_AVX2, ARGBAddRow_AVX2, 0, 4, 4, 4, 7) -#endif -#ifdef HAS_ARGBSUBTRACTROW_AVX2 -ANY21(ARGBSubtractRow_Any_AVX2, ARGBSubtractRow_AVX2, 0, 4, 4, 4, 7) -#endif -#ifdef HAS_ARGBMULTIPLYROW_NEON -ANY21(ARGBMultiplyRow_Any_NEON, ARGBMultiplyRow_NEON, 0, 4, 4, 4, 7) -#endif -#ifdef HAS_ARGBADDROW_NEON -ANY21(ARGBAddRow_Any_NEON, ARGBAddRow_NEON, 0, 4, 4, 4, 7) -#endif -#ifdef HAS_ARGBSUBTRACTROW_NEON -ANY21(ARGBSubtractRow_Any_NEON, ARGBSubtractRow_NEON, 0, 4, 4, 4, 7) -#endif -#ifdef HAS_ARGBMULTIPLYROW_MSA -ANY21(ARGBMultiplyRow_Any_MSA, ARGBMultiplyRow_MSA, 0, 4, 4, 4, 3) -#endif -#ifdef HAS_ARGBMULTIPLYROW_MMI -ANY21(ARGBMultiplyRow_Any_MMI, ARGBMultiplyRow_MMI, 0, 4, 4, 4, 1) -#endif -#ifdef HAS_ARGBADDROW_MSA -ANY21(ARGBAddRow_Any_MSA, ARGBAddRow_MSA, 0, 4, 4, 4, 7) -#endif -#ifdef HAS_ARGBADDROW_MMI -ANY21(ARGBAddRow_Any_MMI, ARGBAddRow_MMI, 0, 4, 4, 4, 1) -#endif -#ifdef HAS_ARGBSUBTRACTROW_MSA -ANY21(ARGBSubtractRow_Any_MSA, ARGBSubtractRow_MSA, 0, 4, 4, 4, 7) -#endif -#ifdef HAS_ARGBSUBTRACTROW_MMI -ANY21(ARGBSubtractRow_Any_MMI, ARGBSubtractRow_MMI, 0, 4, 4, 4, 1) -#endif -#ifdef HAS_SOBELROW_SSE2 -ANY21(SobelRow_Any_SSE2, SobelRow_SSE2, 0, 1, 1, 4, 15) -#endif -#ifdef HAS_SOBELROW_NEON -ANY21(SobelRow_Any_NEON, SobelRow_NEON, 0, 1, 1, 4, 7) -#endif -#ifdef HAS_SOBELROW_MSA -ANY21(SobelRow_Any_MSA, SobelRow_MSA, 0, 1, 1, 4, 15) -#endif -#ifdef HAS_SOBELROW_MMI -ANY21(SobelRow_Any_MMI, SobelRow_MMI, 0, 1, 1, 4, 7) -#endif -#ifdef HAS_SOBELTOPLANEROW_SSE2 -ANY21(SobelToPlaneRow_Any_SSE2, SobelToPlaneRow_SSE2, 0, 1, 1, 1, 15) -#endif -#ifdef HAS_SOBELTOPLANEROW_NEON -ANY21(SobelToPlaneRow_Any_NEON, SobelToPlaneRow_NEON, 0, 1, 1, 1, 15) -#endif -#ifdef HAS_SOBELTOPLANEROW_MSA -ANY21(SobelToPlaneRow_Any_MSA, SobelToPlaneRow_MSA, 0, 1, 1, 1, 31) -#endif -#ifdef HAS_SOBELTOPLANEROW_MMI -ANY21(SobelToPlaneRow_Any_MMI, SobelToPlaneRow_MMI, 0, 1, 1, 1, 7) -#endif -#ifdef HAS_SOBELXYROW_SSE2 -ANY21(SobelXYRow_Any_SSE2, SobelXYRow_SSE2, 0, 1, 1, 4, 15) -#endif -#ifdef HAS_SOBELXYROW_NEON -ANY21(SobelXYRow_Any_NEON, SobelXYRow_NEON, 0, 1, 1, 4, 7) -#endif -#ifdef HAS_SOBELXYROW_MSA -ANY21(SobelXYRow_Any_MSA, SobelXYRow_MSA, 0, 1, 1, 4, 15) -#endif -#ifdef HAS_SOBELXYROW_MMI -ANY21(SobelXYRow_Any_MMI, SobelXYRow_MMI, 0, 1, 1, 4, 7) -#endif -#undef ANY21 - -// Any 2 planes to 1 with yuvconstants -#define ANY21C(NAMEANY, ANY_SIMD, UVSHIFT, SBPP, SBPP2, BPP, MASK) \ - void NAMEANY(const uint8_t* y_buf, const uint8_t* uv_buf, uint8_t* dst_ptr, \ - const struct YuvConstants* yuvconstants, int width) { \ - SIMD_ALIGNED(uint8_t temp[128 * 3]); \ - memset(temp, 0, 128 * 2); /* for msan */ \ - int r = width & MASK; \ - int n = width & ~MASK; \ - if (n > 0) { \ - ANY_SIMD(y_buf, uv_buf, dst_ptr, yuvconstants, n); \ - } \ - memcpy(temp, y_buf + n * SBPP, r * SBPP); \ - memcpy(temp + 128, uv_buf + (n >> UVSHIFT) * SBPP2, \ - SS(r, UVSHIFT) * SBPP2); \ - ANY_SIMD(temp, temp + 128, temp + 256, yuvconstants, MASK + 1); \ - memcpy(dst_ptr + n * BPP, temp + 256, r * BPP); \ - } - -// Biplanar to RGB. 
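For kernels that read subsampled chroma, the tail copy uses SS(r, UVSHIFT) elements instead of r, so an odd leftover width is rounded up to a whole chroma sample. A quick check of that rounding, with the SS macro repeated from the top of this file so the snippet stands alone (the example widths are arbitrary):

#define SS(width, shift) (((width) + (1 << (shift)) - 1) >> (shift))

/* 4:2:0 / 4:2:2 chroma (shift 1): a 7-pixel tail still needs 4 UV samples,
   an 8-pixel tail needs exactly 4, and 4:4:4 data (shift 0) is unaffected. */
_Static_assert(SS(7, 1) == 4, "odd tail rounds up to a whole UV sample");
_Static_assert(SS(8, 1) == 4, "even tail divides exactly");
_Static_assert(SS(5, 0) == 5, "no subsampling, no rounding");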
-#ifdef HAS_NV12TOARGBROW_SSSE3 -ANY21C(NV12ToARGBRow_Any_SSSE3, NV12ToARGBRow_SSSE3, 1, 1, 2, 4, 7) -#endif -#ifdef HAS_NV12TOARGBROW_AVX2 -ANY21C(NV12ToARGBRow_Any_AVX2, NV12ToARGBRow_AVX2, 1, 1, 2, 4, 15) -#endif -#ifdef HAS_NV12TOARGBROW_NEON -ANY21C(NV12ToARGBRow_Any_NEON, NV12ToARGBRow_NEON, 1, 1, 2, 4, 7) -#endif -#ifdef HAS_NV12TOARGBROW_MSA -ANY21C(NV12ToARGBRow_Any_MSA, NV12ToARGBRow_MSA, 1, 1, 2, 4, 7) -#endif -#ifdef HAS_NV12TOARGBROW_MMI -ANY21C(NV12ToARGBRow_Any_MMI, NV12ToARGBRow_MMI, 1, 1, 2, 4, 7) -#endif -#ifdef HAS_NV21TOARGBROW_SSSE3 -ANY21C(NV21ToARGBRow_Any_SSSE3, NV21ToARGBRow_SSSE3, 1, 1, 2, 4, 7) -#endif -#ifdef HAS_NV21TOARGBROW_AVX2 -ANY21C(NV21ToARGBRow_Any_AVX2, NV21ToARGBRow_AVX2, 1, 1, 2, 4, 15) -#endif -#ifdef HAS_NV21TOARGBROW_NEON -ANY21C(NV21ToARGBRow_Any_NEON, NV21ToARGBRow_NEON, 1, 1, 2, 4, 7) -#endif -#ifdef HAS_NV21TOARGBROW_MSA -ANY21C(NV21ToARGBRow_Any_MSA, NV21ToARGBRow_MSA, 1, 1, 2, 4, 7) -#endif -#ifdef HAS_NV21TOARGBROW_MMI -ANY21C(NV21ToARGBRow_Any_MMI, NV21ToARGBRow_MMI, 1, 1, 2, 4, 7) -#endif -#ifdef HAS_NV12TORGB24ROW_NEON -ANY21C(NV12ToRGB24Row_Any_NEON, NV12ToRGB24Row_NEON, 1, 1, 2, 3, 7) -#endif -#ifdef HAS_NV21TORGB24ROW_NEON -ANY21C(NV21ToRGB24Row_Any_NEON, NV21ToRGB24Row_NEON, 1, 1, 2, 3, 7) -#endif -#ifdef HAS_NV12TORGB24ROW_SSSE3 -ANY21C(NV12ToRGB24Row_Any_SSSE3, NV12ToRGB24Row_SSSE3, 1, 1, 2, 3, 15) -#endif -#ifdef HAS_NV12TORGB24ROW_MMI -ANY21C(NV12ToRGB24Row_Any_MMI, NV12ToRGB24Row_MMI, 1, 1, 2, 3, 7) -#endif -#ifdef HAS_NV21TORGB24ROW_SSSE3 -ANY21C(NV21ToRGB24Row_Any_SSSE3, NV21ToRGB24Row_SSSE3, 1, 1, 2, 3, 15) -#endif -#ifdef HAS_NV12TORGB24ROW_AVX2 -ANY21C(NV12ToRGB24Row_Any_AVX2, NV12ToRGB24Row_AVX2, 1, 1, 2, 3, 31) -#endif -#ifdef HAS_NV21TORGB24ROW_AVX2 -ANY21C(NV21ToRGB24Row_Any_AVX2, NV21ToRGB24Row_AVX2, 1, 1, 2, 3, 31) -#endif -#ifdef HAS_NV21TORGB24ROW_MMI -ANY21C(NV21ToRGB24Row_Any_MMI, NV21ToRGB24Row_MMI, 1, 1, 2, 3, 7) -#endif -#ifdef HAS_NV12TORGB565ROW_SSSE3 -ANY21C(NV12ToRGB565Row_Any_SSSE3, NV12ToRGB565Row_SSSE3, 1, 1, 2, 2, 7) -#endif -#ifdef HAS_NV12TORGB565ROW_AVX2 -ANY21C(NV12ToRGB565Row_Any_AVX2, NV12ToRGB565Row_AVX2, 1, 1, 2, 2, 15) -#endif -#ifdef HAS_NV12TORGB565ROW_NEON -ANY21C(NV12ToRGB565Row_Any_NEON, NV12ToRGB565Row_NEON, 1, 1, 2, 2, 7) -#endif -#ifdef HAS_NV12TORGB565ROW_MSA -ANY21C(NV12ToRGB565Row_Any_MSA, NV12ToRGB565Row_MSA, 1, 1, 2, 2, 7) -#endif -#ifdef HAS_NV12TORGB565ROW_MMI -ANY21C(NV12ToRGB565Row_Any_MMI, NV12ToRGB565Row_MMI, 1, 1, 2, 2, 7) -#endif -#undef ANY21C - -// Any 2 planes of 16 bit to 1 with yuvconstants -#define ANY21CT(NAMEANY, ANY_SIMD, UVSHIFT, DUVSHIFT, T, SBPP, BPP, MASK) \ - void NAMEANY(const T* y_buf, const T* uv_buf, uint8_t* dst_ptr, \ - const struct YuvConstants* yuvconstants, int width) { \ - SIMD_ALIGNED(T temp[16 * 3]); \ - SIMD_ALIGNED(uint8_t out[64]); \ - memset(temp, 0, 16 * 3 * SBPP); /* for YUY2 and msan */ \ - int r = width & MASK; \ - int n = width & ~MASK; \ - if (n > 0) { \ - ANY_SIMD(y_buf, uv_buf, dst_ptr, yuvconstants, n); \ - } \ - memcpy(temp, y_buf + n, r * SBPP); \ - memcpy(temp + 16, uv_buf + 2 * (n >> UVSHIFT), SS(r, UVSHIFT) * SBPP * 2); \ - ANY_SIMD(temp, temp + 16, out, yuvconstants, MASK + 1); \ - memcpy(dst_ptr + (n >> DUVSHIFT) * BPP, out, SS(r, DUVSHIFT) * BPP); \ - } - -#ifdef HAS_P210TOAR30ROW_SSSE3 -ANY21CT(P210ToAR30Row_Any_SSSE3, P210ToAR30Row_SSSE3, 1, 0, uint16_t, 2, 4, 7) -#endif -#ifdef HAS_P210TOARGBROW_SSSE3 -ANY21CT(P210ToARGBRow_Any_SSSE3, P210ToARGBRow_SSSE3, 1, 0, uint16_t, 2, 4, 7) -#endif -#ifdef HAS_P210TOARGBROW_AVX2 
-ANY21CT(P210ToARGBRow_Any_AVX2, P210ToARGBRow_AVX2, 1, 0, uint16_t, 2, 4, 15) -#endif -#ifdef HAS_P210TOAR30ROW_AVX2 -ANY21CT(P210ToAR30Row_Any_AVX2, P210ToAR30Row_AVX2, 1, 0, uint16_t, 2, 4, 15) -#endif -#ifdef HAS_P410TOAR30ROW_SSSE3 -ANY21CT(P410ToAR30Row_Any_SSSE3, P410ToAR30Row_SSSE3, 0, 0, uint16_t, 2, 4, 7) -#endif -#ifdef HAS_P410TOARGBROW_SSSE3 -ANY21CT(P410ToARGBRow_Any_SSSE3, P410ToARGBRow_SSSE3, 0, 0, uint16_t, 2, 4, 7) -#endif -#ifdef HAS_P410TOARGBROW_AVX2 -ANY21CT(P410ToARGBRow_Any_AVX2, P410ToARGBRow_AVX2, 0, 0, uint16_t, 2, 4, 15) -#endif -#ifdef HAS_P410TOAR30ROW_AVX2 -ANY21CT(P410ToAR30Row_Any_AVX2, P410ToAR30Row_AVX2, 0, 0, uint16_t, 2, 4, 15) -#endif - -#undef ANY21CT - -// Any 2 16 bit planes with parameter to 1 -#define ANY21PT(NAMEANY, ANY_SIMD, T, BPP, MASK) \ - void NAMEANY(const T* src_u, const T* src_v, T* dst_uv, int depth, \ - int width) { \ - SIMD_ALIGNED(T temp[16 * 4]); \ - memset(temp, 0, 16 * 4 * BPP); /* for msan */ \ - int r = width & MASK; \ - int n = width & ~MASK; \ - if (n > 0) { \ - ANY_SIMD(src_u, src_v, dst_uv, depth, n); \ - } \ - memcpy(temp, src_u + n, r * BPP); \ - memcpy(temp + 16, src_v + n, r * BPP); \ - ANY_SIMD(temp, temp + 16, temp + 32, depth, MASK + 1); \ - memcpy(dst_uv + n * 2, temp + 32, r * BPP * 2); \ - } - -#ifdef HAS_MERGEUVROW_16_AVX2 -ANY21PT(MergeUVRow_16_Any_AVX2, MergeUVRow_16_AVX2, uint16_t, 2, 15) -#endif -#ifdef HAS_MERGEUVROW_16_NEON -ANY21PT(MergeUVRow_16_Any_NEON, MergeUVRow_16_NEON, uint16_t, 2, 7) -#endif - -#undef ANY21CT - -// Any 1 to 1. -#define ANY11(NAMEANY, ANY_SIMD, UVSHIFT, SBPP, BPP, MASK) \ - void NAMEANY(const uint8_t* src_ptr, uint8_t* dst_ptr, int width) { \ - SIMD_ALIGNED(uint8_t temp[128 * 2]); \ - memset(temp, 0, 128); /* for YUY2 and msan */ \ - int r = width & MASK; \ - int n = width & ~MASK; \ - if (n > 0) { \ - ANY_SIMD(src_ptr, dst_ptr, n); \ - } \ - memcpy(temp, src_ptr + (n >> UVSHIFT) * SBPP, SS(r, UVSHIFT) * SBPP); \ - ANY_SIMD(temp, temp + 128, MASK + 1); \ - memcpy(dst_ptr + n * BPP, temp + 128, r * BPP); \ - } - -#ifdef HAS_COPYROW_AVX -ANY11(CopyRow_Any_AVX, CopyRow_AVX, 0, 1, 1, 63) -#endif -#ifdef HAS_COPYROW_SSE2 -ANY11(CopyRow_Any_SSE2, CopyRow_SSE2, 0, 1, 1, 31) -#endif -#ifdef HAS_COPYROW_NEON -ANY11(CopyRow_Any_NEON, CopyRow_NEON, 0, 1, 1, 31) -#endif -#if defined(HAS_ARGBTORGB24ROW_SSSE3) -ANY11(ARGBToRGB24Row_Any_SSSE3, ARGBToRGB24Row_SSSE3, 0, 4, 3, 15) -ANY11(ARGBToRAWRow_Any_SSSE3, ARGBToRAWRow_SSSE3, 0, 4, 3, 15) -ANY11(ARGBToRGB565Row_Any_SSE2, ARGBToRGB565Row_SSE2, 0, 4, 2, 3) -ANY11(ARGBToARGB1555Row_Any_SSE2, ARGBToARGB1555Row_SSE2, 0, 4, 2, 3) -ANY11(ARGBToARGB4444Row_Any_SSE2, ARGBToARGB4444Row_SSE2, 0, 4, 2, 3) -#endif -#if defined(HAS_ARGBTORGB24ROW_AVX2) -ANY11(ARGBToRGB24Row_Any_AVX2, ARGBToRGB24Row_AVX2, 0, 4, 3, 31) -#endif -#if defined(HAS_ARGBTORGB24ROW_AVX512VBMI) -ANY11(ARGBToRGB24Row_Any_AVX512VBMI, ARGBToRGB24Row_AVX512VBMI, 0, 4, 3, 31) -#endif -#if defined(HAS_ARGBTORAWROW_AVX2) -ANY11(ARGBToRAWRow_Any_AVX2, ARGBToRAWRow_AVX2, 0, 4, 3, 31) -#endif -#if defined(HAS_ARGBTORGB565ROW_AVX2) -ANY11(ARGBToRGB565Row_Any_AVX2, ARGBToRGB565Row_AVX2, 0, 4, 2, 7) -#endif -#if defined(HAS_ARGBTOARGB4444ROW_AVX2) -ANY11(ARGBToARGB1555Row_Any_AVX2, ARGBToARGB1555Row_AVX2, 0, 4, 2, 7) -ANY11(ARGBToARGB4444Row_Any_AVX2, ARGBToARGB4444Row_AVX2, 0, 4, 2, 7) -#endif -#if defined(HAS_ABGRTOAR30ROW_SSSE3) -ANY11(ABGRToAR30Row_Any_SSSE3, ABGRToAR30Row_SSSE3, 0, 4, 4, 3) -#endif -#if defined(HAS_ARGBTOAR30ROW_SSSE3) -ANY11(ARGBToAR30Row_Any_SSSE3, ARGBToAR30Row_SSSE3, 0, 4, 
4, 3) -#endif -#if defined(HAS_ABGRTOAR30ROW_AVX2) -ANY11(ABGRToAR30Row_Any_AVX2, ABGRToAR30Row_AVX2, 0, 4, 4, 7) -#endif -#if defined(HAS_ARGBTOAR30ROW_AVX2) -ANY11(ARGBToAR30Row_Any_AVX2, ARGBToAR30Row_AVX2, 0, 4, 4, 7) -#endif -#if defined(HAS_J400TOARGBROW_SSE2) -ANY11(J400ToARGBRow_Any_SSE2, J400ToARGBRow_SSE2, 0, 1, 4, 7) -#endif -#if defined(HAS_J400TOARGBROW_AVX2) -ANY11(J400ToARGBRow_Any_AVX2, J400ToARGBRow_AVX2, 0, 1, 4, 15) -#endif -#if defined(HAS_RGB24TOARGBROW_SSSE3) -ANY11(RGB24ToARGBRow_Any_SSSE3, RGB24ToARGBRow_SSSE3, 0, 3, 4, 15) -ANY11(RAWToARGBRow_Any_SSSE3, RAWToARGBRow_SSSE3, 0, 3, 4, 15) -ANY11(RGB565ToARGBRow_Any_SSE2, RGB565ToARGBRow_SSE2, 0, 2, 4, 7) -ANY11(ARGB1555ToARGBRow_Any_SSE2, ARGB1555ToARGBRow_SSE2, 0, 2, 4, 7) -ANY11(ARGB4444ToARGBRow_Any_SSE2, ARGB4444ToARGBRow_SSE2, 0, 2, 4, 7) -#endif -#if defined(HAS_RAWTORGBAROW_SSSE3) -ANY11(RAWToRGBARow_Any_SSSE3, RAWToRGBARow_SSSE3, 0, 3, 4, 15) -#endif -#if defined(HAS_RAWTORGB24ROW_SSSE3) -ANY11(RAWToRGB24Row_Any_SSSE3, RAWToRGB24Row_SSSE3, 0, 3, 3, 7) -#endif -#if defined(HAS_RGB565TOARGBROW_AVX2) -ANY11(RGB565ToARGBRow_Any_AVX2, RGB565ToARGBRow_AVX2, 0, 2, 4, 15) -#endif -#if defined(HAS_ARGB1555TOARGBROW_AVX2) -ANY11(ARGB1555ToARGBRow_Any_AVX2, ARGB1555ToARGBRow_AVX2, 0, 2, 4, 15) -#endif -#if defined(HAS_ARGB4444TOARGBROW_AVX2) -ANY11(ARGB4444ToARGBRow_Any_AVX2, ARGB4444ToARGBRow_AVX2, 0, 2, 4, 15) -#endif -#if defined(HAS_ARGBTORGB24ROW_NEON) -ANY11(ARGBToRGB24Row_Any_NEON, ARGBToRGB24Row_NEON, 0, 4, 3, 7) -ANY11(ARGBToRAWRow_Any_NEON, ARGBToRAWRow_NEON, 0, 4, 3, 7) -ANY11(ARGBToRGB565Row_Any_NEON, ARGBToRGB565Row_NEON, 0, 4, 2, 7) -ANY11(ARGBToARGB1555Row_Any_NEON, ARGBToARGB1555Row_NEON, 0, 4, 2, 7) -ANY11(ARGBToARGB4444Row_Any_NEON, ARGBToARGB4444Row_NEON, 0, 4, 2, 7) -ANY11(J400ToARGBRow_Any_NEON, J400ToARGBRow_NEON, 0, 1, 4, 7) -#endif -#if defined(HAS_ARGBTORGB24ROW_MSA) -ANY11(ARGBToRGB24Row_Any_MSA, ARGBToRGB24Row_MSA, 0, 4, 3, 15) -ANY11(ARGBToRAWRow_Any_MSA, ARGBToRAWRow_MSA, 0, 4, 3, 15) -ANY11(ARGBToRGB565Row_Any_MSA, ARGBToRGB565Row_MSA, 0, 4, 2, 7) -ANY11(ARGBToARGB1555Row_Any_MSA, ARGBToARGB1555Row_MSA, 0, 4, 2, 7) -ANY11(ARGBToARGB4444Row_Any_MSA, ARGBToARGB4444Row_MSA, 0, 4, 2, 7) -ANY11(J400ToARGBRow_Any_MSA, J400ToARGBRow_MSA, 0, 1, 4, 15) -#endif -#if defined(HAS_ARGBTORGB24ROW_MMI) -ANY11(ARGBToRGB24Row_Any_MMI, ARGBToRGB24Row_MMI, 0, 4, 3, 3) -ANY11(ARGBToRAWRow_Any_MMI, ARGBToRAWRow_MMI, 0, 4, 3, 3) -ANY11(ARGBToRGB565Row_Any_MMI, ARGBToRGB565Row_MMI, 0, 4, 2, 3) -ANY11(ARGBToARGB1555Row_Any_MMI, ARGBToARGB1555Row_MMI, 0, 4, 2, 3) -ANY11(ARGBToARGB4444Row_Any_MMI, ARGBToARGB4444Row_MMI, 0, 4, 2, 3) -ANY11(J400ToARGBRow_Any_MMI, J400ToARGBRow_MMI, 0, 1, 4, 3) -#endif -#if defined(HAS_RAWTORGB24ROW_NEON) -ANY11(RAWToRGB24Row_Any_NEON, RAWToRGB24Row_NEON, 0, 3, 3, 7) -#endif -#if defined(HAS_RAWTORGB24ROW_MSA) -ANY11(RAWToRGB24Row_Any_MSA, RAWToRGB24Row_MSA, 0, 3, 3, 15) -#endif -#if defined(HAS_RAWTORGB24ROW_MMI) -ANY11(RAWToRGB24Row_Any_MMI, RAWToRGB24Row_MMI, 0, 3, 3, 3) -#endif -#ifdef HAS_ARGBTOYROW_AVX2 -ANY11(ARGBToYRow_Any_AVX2, ARGBToYRow_AVX2, 0, 4, 1, 31) -#endif -#ifdef HAS_ABGRTOYROW_AVX2 -ANY11(ABGRToYRow_Any_AVX2, ABGRToYRow_AVX2, 0, 4, 1, 31) -#endif -#ifdef HAS_ARGBTOYJROW_AVX2 -ANY11(ARGBToYJRow_Any_AVX2, ARGBToYJRow_AVX2, 0, 4, 1, 31) -#endif -#ifdef HAS_RGBATOYJROW_AVX2 -ANY11(RGBAToYJRow_Any_AVX2, RGBAToYJRow_AVX2, 0, 4, 1, 31) -#endif -#ifdef HAS_UYVYTOYROW_AVX2 -ANY11(UYVYToYRow_Any_AVX2, UYVYToYRow_AVX2, 0, 2, 1, 31) -#endif -#ifdef HAS_YUY2TOYROW_AVX2 
-ANY11(YUY2ToYRow_Any_AVX2, YUY2ToYRow_AVX2, 1, 4, 1, 31) -#endif -#ifdef HAS_ARGBTOYROW_SSSE3 -ANY11(ARGBToYRow_Any_SSSE3, ARGBToYRow_SSSE3, 0, 4, 1, 15) -#endif -#ifdef HAS_BGRATOYROW_SSSE3 -ANY11(BGRAToYRow_Any_SSSE3, BGRAToYRow_SSSE3, 0, 4, 1, 15) -ANY11(ABGRToYRow_Any_SSSE3, ABGRToYRow_SSSE3, 0, 4, 1, 15) -ANY11(RGBAToYRow_Any_SSSE3, RGBAToYRow_SSSE3, 0, 4, 1, 15) -ANY11(YUY2ToYRow_Any_SSE2, YUY2ToYRow_SSE2, 1, 4, 1, 15) -ANY11(UYVYToYRow_Any_SSE2, UYVYToYRow_SSE2, 1, 4, 1, 15) -#endif -#ifdef HAS_ARGBTOYJROW_SSSE3 -ANY11(ARGBToYJRow_Any_SSSE3, ARGBToYJRow_SSSE3, 0, 4, 1, 15) -#endif -#ifdef HAS_RGBATOYJROW_SSSE3 -ANY11(RGBAToYJRow_Any_SSSE3, RGBAToYJRow_SSSE3, 0, 4, 1, 15) -#endif -#ifdef HAS_ARGBTOYROW_NEON -ANY11(ARGBToYRow_Any_NEON, ARGBToYRow_NEON, 0, 4, 1, 7) -#endif -#ifdef HAS_ARGBTOYROW_MSA -ANY11(ARGBToYRow_Any_MSA, ARGBToYRow_MSA, 0, 4, 1, 15) -#endif -#ifdef HAS_ARGBTOYROW_MMI -ANY11(ARGBToYRow_Any_MMI, ARGBToYRow_MMI, 0, 4, 1, 7) -#endif -#ifdef HAS_ARGBTOYJROW_NEON -ANY11(ARGBToYJRow_Any_NEON, ARGBToYJRow_NEON, 0, 4, 1, 7) -#endif -#ifdef HAS_RGBATOYJROW_NEON -ANY11(RGBAToYJRow_Any_NEON, RGBAToYJRow_NEON, 0, 4, 1, 7) -#endif -#ifdef HAS_ARGBTOYJROW_MSA -ANY11(ARGBToYJRow_Any_MSA, ARGBToYJRow_MSA, 0, 4, 1, 15) -#endif -#ifdef HAS_ARGBTOYJROW_MMI -ANY11(ARGBToYJRow_Any_MMI, ARGBToYJRow_MMI, 0, 4, 1, 7) -#endif -#ifdef HAS_BGRATOYROW_NEON -ANY11(BGRAToYRow_Any_NEON, BGRAToYRow_NEON, 0, 4, 1, 7) -#endif -#ifdef HAS_BGRATOYROW_MSA -ANY11(BGRAToYRow_Any_MSA, BGRAToYRow_MSA, 0, 4, 1, 15) -#endif -#ifdef HAS_BGRATOYROW_MMI -ANY11(BGRAToYRow_Any_MMI, BGRAToYRow_MMI, 0, 4, 1, 7) -#endif -#ifdef HAS_ABGRTOYROW_NEON -ANY11(ABGRToYRow_Any_NEON, ABGRToYRow_NEON, 0, 4, 1, 7) -#endif -#ifdef HAS_ABGRTOYROW_MSA -ANY11(ABGRToYRow_Any_MSA, ABGRToYRow_MSA, 0, 4, 1, 7) -#endif -#ifdef HAS_ABGRTOYROW_MMI -ANY11(ABGRToYRow_Any_MMI, ABGRToYRow_MMI, 0, 4, 1, 7) -#endif -#ifdef HAS_RGBATOYROW_NEON -ANY11(RGBAToYRow_Any_NEON, RGBAToYRow_NEON, 0, 4, 1, 7) -#endif -#ifdef HAS_RGBATOYROW_MSA -ANY11(RGBAToYRow_Any_MSA, RGBAToYRow_MSA, 0, 4, 1, 15) -#endif -#ifdef HAS_RGBATOYROW_MMI -ANY11(RGBAToYRow_Any_MMI, RGBAToYRow_MMI, 0, 4, 1, 7) -#endif -#ifdef HAS_RGB24TOYROW_NEON -ANY11(RGB24ToYRow_Any_NEON, RGB24ToYRow_NEON, 0, 3, 1, 7) -#endif -#ifdef HAS_RGB24TOYJROW_AVX2 -ANY11(RGB24ToYJRow_Any_AVX2, RGB24ToYJRow_AVX2, 0, 3, 1, 31) -#endif -#ifdef HAS_RGB24TOYJROW_SSSE3 -ANY11(RGB24ToYJRow_Any_SSSE3, RGB24ToYJRow_SSSE3, 0, 3, 1, 15) -#endif -#ifdef HAS_RGB24TOYJROW_NEON -ANY11(RGB24ToYJRow_Any_NEON, RGB24ToYJRow_NEON, 0, 3, 1, 7) -#endif -#ifdef HAS_RGB24TOYROW_MSA -ANY11(RGB24ToYRow_Any_MSA, RGB24ToYRow_MSA, 0, 3, 1, 15) -#endif -#ifdef HAS_RGB24TOYROW_MMI -ANY11(RGB24ToYRow_Any_MMI, RGB24ToYRow_MMI, 0, 3, 1, 7) -#endif -#ifdef HAS_RAWTOYROW_NEON -ANY11(RAWToYRow_Any_NEON, RAWToYRow_NEON, 0, 3, 1, 7) -#endif -#ifdef HAS_RAWTOYJROW_AVX2 -ANY11(RAWToYJRow_Any_AVX2, RAWToYJRow_AVX2, 0, 3, 1, 31) -#endif -#ifdef HAS_RAWTOYJROW_SSSE3 -ANY11(RAWToYJRow_Any_SSSE3, RAWToYJRow_SSSE3, 0, 3, 1, 15) -#endif -#ifdef HAS_RAWTOYJROW_NEON -ANY11(RAWToYJRow_Any_NEON, RAWToYJRow_NEON, 0, 3, 1, 7) -#endif -#ifdef HAS_RAWTOYROW_MSA -ANY11(RAWToYRow_Any_MSA, RAWToYRow_MSA, 0, 3, 1, 15) -#endif -#ifdef HAS_RAWTOYROW_MMI -ANY11(RAWToYRow_Any_MMI, RAWToYRow_MMI, 0, 3, 1, 7) -#endif -#ifdef HAS_RGB565TOYROW_NEON -ANY11(RGB565ToYRow_Any_NEON, RGB565ToYRow_NEON, 0, 2, 1, 7) -#endif -#ifdef HAS_RGB565TOYROW_MSA -ANY11(RGB565ToYRow_Any_MSA, RGB565ToYRow_MSA, 0, 2, 1, 15) -#endif -#ifdef HAS_RGB565TOYROW_MMI 
-ANY11(RGB565ToYRow_Any_MMI, RGB565ToYRow_MMI, 0, 2, 1, 7) -#endif -#ifdef HAS_ARGB1555TOYROW_NEON -ANY11(ARGB1555ToYRow_Any_NEON, ARGB1555ToYRow_NEON, 0, 2, 1, 7) -#endif -#ifdef HAS_ARGB1555TOYROW_MSA -ANY11(ARGB1555ToYRow_Any_MSA, ARGB1555ToYRow_MSA, 0, 2, 1, 15) -#endif -#ifdef HAS_ARGB1555TOYROW_MMI -ANY11(ARGB1555ToYRow_Any_MMI, ARGB1555ToYRow_MMI, 0, 2, 1, 7) -#endif -#ifdef HAS_ARGB4444TOYROW_NEON -ANY11(ARGB4444ToYRow_Any_NEON, ARGB4444ToYRow_NEON, 0, 2, 1, 7) -#endif -#ifdef HAS_ARGB4444TOYROW_MMI -ANY11(ARGB4444ToYRow_Any_MMI, ARGB4444ToYRow_MMI, 0, 2, 1, 7) -#endif -#ifdef HAS_YUY2TOYROW_NEON -ANY11(YUY2ToYRow_Any_NEON, YUY2ToYRow_NEON, 1, 4, 1, 15) -#endif -#ifdef HAS_UYVYTOYROW_NEON -ANY11(UYVYToYRow_Any_NEON, UYVYToYRow_NEON, 1, 4, 1, 15) -#endif -#ifdef HAS_YUY2TOYROW_MSA -ANY11(YUY2ToYRow_Any_MSA, YUY2ToYRow_MSA, 1, 4, 1, 31) -#endif -#ifdef HAS_YUY2TOYROW_MMI -ANY11(YUY2ToYRow_Any_MMI, YUY2ToYRow_MMI, 1, 4, 1, 7) -#endif -#ifdef HAS_UYVYTOYROW_MSA -ANY11(UYVYToYRow_Any_MSA, UYVYToYRow_MSA, 1, 4, 1, 31) -#endif -#ifdef HAS_UYVYTOYROW_MMI -ANY11(UYVYToYRow_Any_MMI, UYVYToYRow_MMI, 1, 4, 1, 15) -#endif -#ifdef HAS_AYUVTOYROW_NEON -ANY11(AYUVToYRow_Any_NEON, AYUVToYRow_NEON, 0, 4, 1, 15) -#endif -#ifdef HAS_SWAPUVROW_SSSE3 -ANY11(SwapUVRow_Any_SSSE3, SwapUVRow_SSSE3, 0, 2, 2, 15) -#endif -#ifdef HAS_SWAPUVROW_AVX2 -ANY11(SwapUVRow_Any_AVX2, SwapUVRow_AVX2, 0, 2, 2, 31) -#endif -#ifdef HAS_SWAPUVROW_NEON -ANY11(SwapUVRow_Any_NEON, SwapUVRow_NEON, 0, 2, 2, 15) -#endif -#ifdef HAS_RGB24TOARGBROW_NEON -ANY11(RGB24ToARGBRow_Any_NEON, RGB24ToARGBRow_NEON, 0, 3, 4, 7) -#endif -#ifdef HAS_RGB24TOARGBROW_MSA -ANY11(RGB24ToARGBRow_Any_MSA, RGB24ToARGBRow_MSA, 0, 3, 4, 15) -#endif -#ifdef HAS_RGB24TOARGBROW_MMI -ANY11(RGB24ToARGBRow_Any_MMI, RGB24ToARGBRow_MMI, 0, 3, 4, 3) -#endif -#ifdef HAS_RAWTOARGBROW_NEON -ANY11(RAWToARGBRow_Any_NEON, RAWToARGBRow_NEON, 0, 3, 4, 7) -#endif -#ifdef HAS_RAWTORGBAROW_NEON -ANY11(RAWToRGBARow_Any_NEON, RAWToRGBARow_NEON, 0, 3, 4, 7) -#endif -#ifdef HAS_RAWTOARGBROW_MSA -ANY11(RAWToARGBRow_Any_MSA, RAWToARGBRow_MSA, 0, 3, 4, 15) -#endif -#ifdef HAS_RAWTOARGBROW_MMI -ANY11(RAWToARGBRow_Any_MMI, RAWToARGBRow_MMI, 0, 3, 4, 3) -#endif -#ifdef HAS_RGB565TOARGBROW_NEON -ANY11(RGB565ToARGBRow_Any_NEON, RGB565ToARGBRow_NEON, 0, 2, 4, 7) -#endif -#ifdef HAS_RGB565TOARGBROW_MSA -ANY11(RGB565ToARGBRow_Any_MSA, RGB565ToARGBRow_MSA, 0, 2, 4, 15) -#endif -#ifdef HAS_RGB565TOARGBROW_MMI -ANY11(RGB565ToARGBRow_Any_MMI, RGB565ToARGBRow_MMI, 0, 2, 4, 3) -#endif -#ifdef HAS_ARGB1555TOARGBROW_NEON -ANY11(ARGB1555ToARGBRow_Any_NEON, ARGB1555ToARGBRow_NEON, 0, 2, 4, 7) -#endif -#ifdef HAS_ARGB1555TOARGBROW_MSA -ANY11(ARGB1555ToARGBRow_Any_MSA, ARGB1555ToARGBRow_MSA, 0, 2, 4, 15) -#endif -#ifdef HAS_ARGB1555TOARGBROW_MMI -ANY11(ARGB1555ToARGBRow_Any_MMI, ARGB1555ToARGBRow_MMI, 0, 2, 4, 3) -#endif -#ifdef HAS_ARGB4444TOARGBROW_NEON -ANY11(ARGB4444ToARGBRow_Any_NEON, ARGB4444ToARGBRow_NEON, 0, 2, 4, 7) -#endif -#ifdef HAS_ARGB4444TOARGBROW_MSA -ANY11(ARGB4444ToARGBRow_Any_MSA, ARGB4444ToARGBRow_MSA, 0, 2, 4, 15) -#endif -#ifdef HAS_ARGB4444TOARGBROW_MMI -ANY11(ARGB4444ToARGBRow_Any_MMI, ARGB4444ToARGBRow_MMI, 0, 2, 4, 3) -#endif -#ifdef HAS_ARGBATTENUATEROW_SSSE3 -ANY11(ARGBAttenuateRow_Any_SSSE3, ARGBAttenuateRow_SSSE3, 0, 4, 4, 3) -#endif -#ifdef HAS_ARGBUNATTENUATEROW_SSE2 -ANY11(ARGBUnattenuateRow_Any_SSE2, ARGBUnattenuateRow_SSE2, 0, 4, 4, 3) -#endif -#ifdef HAS_ARGBATTENUATEROW_AVX2 -ANY11(ARGBAttenuateRow_Any_AVX2, ARGBAttenuateRow_AVX2, 0, 4, 4, 7) -#endif -#ifdef 
HAS_ARGBUNATTENUATEROW_AVX2 -ANY11(ARGBUnattenuateRow_Any_AVX2, ARGBUnattenuateRow_AVX2, 0, 4, 4, 7) -#endif -#ifdef HAS_ARGBATTENUATEROW_NEON -ANY11(ARGBAttenuateRow_Any_NEON, ARGBAttenuateRow_NEON, 0, 4, 4, 7) -#endif -#ifdef HAS_ARGBATTENUATEROW_MSA -ANY11(ARGBAttenuateRow_Any_MSA, ARGBAttenuateRow_MSA, 0, 4, 4, 7) -#endif -#ifdef HAS_ARGBATTENUATEROW_MMI -ANY11(ARGBAttenuateRow_Any_MMI, ARGBAttenuateRow_MMI, 0, 4, 4, 1) -#endif -#ifdef HAS_ARGBEXTRACTALPHAROW_SSE2 -ANY11(ARGBExtractAlphaRow_Any_SSE2, ARGBExtractAlphaRow_SSE2, 0, 4, 1, 7) -#endif -#ifdef HAS_ARGBEXTRACTALPHAROW_AVX2 -ANY11(ARGBExtractAlphaRow_Any_AVX2, ARGBExtractAlphaRow_AVX2, 0, 4, 1, 31) -#endif -#ifdef HAS_ARGBEXTRACTALPHAROW_NEON -ANY11(ARGBExtractAlphaRow_Any_NEON, ARGBExtractAlphaRow_NEON, 0, 4, 1, 15) -#endif -#ifdef HAS_ARGBEXTRACTALPHAROW_MSA -ANY11(ARGBExtractAlphaRow_Any_MSA, ARGBExtractAlphaRow_MSA, 0, 4, 1, 15) -#endif -#ifdef HAS_ARGBEXTRACTALPHAROW_MMI -ANY11(ARGBExtractAlphaRow_Any_MMI, ARGBExtractAlphaRow_MMI, 0, 4, 1, 7) -#endif -#undef ANY11 - -// Any 1 to 1 blended. Destination is read, modify, write. -#define ANY11B(NAMEANY, ANY_SIMD, UVSHIFT, SBPP, BPP, MASK) \ - void NAMEANY(const uint8_t* src_ptr, uint8_t* dst_ptr, int width) { \ - SIMD_ALIGNED(uint8_t temp[64 * 2]); \ - memset(temp, 0, 64 * 2); /* for msan */ \ - int r = width & MASK; \ - int n = width & ~MASK; \ - if (n > 0) { \ - ANY_SIMD(src_ptr, dst_ptr, n); \ - } \ - memcpy(temp, src_ptr + (n >> UVSHIFT) * SBPP, SS(r, UVSHIFT) * SBPP); \ - memcpy(temp + 64, dst_ptr + n * BPP, r * BPP); \ - ANY_SIMD(temp, temp + 64, MASK + 1); \ - memcpy(dst_ptr + n * BPP, temp + 64, r * BPP); \ - } - -#ifdef HAS_ARGBCOPYALPHAROW_AVX2 -ANY11B(ARGBCopyAlphaRow_Any_AVX2, ARGBCopyAlphaRow_AVX2, 0, 4, 4, 15) -#endif -#ifdef HAS_ARGBCOPYALPHAROW_SSE2 -ANY11B(ARGBCopyAlphaRow_Any_SSE2, ARGBCopyAlphaRow_SSE2, 0, 4, 4, 7) -#endif -#ifdef HAS_ARGBCOPYALPHAROW_MMI -ANY11B(ARGBCopyAlphaRow_Any_MMI, ARGBCopyAlphaRow_MMI, 0, 4, 4, 1) -#endif -#ifdef HAS_ARGBCOPYYTOALPHAROW_AVX2 -ANY11B(ARGBCopyYToAlphaRow_Any_AVX2, ARGBCopyYToAlphaRow_AVX2, 0, 1, 4, 15) -#endif -#ifdef HAS_ARGBCOPYYTOALPHAROW_SSE2 -ANY11B(ARGBCopyYToAlphaRow_Any_SSE2, ARGBCopyYToAlphaRow_SSE2, 0, 1, 4, 7) -#endif -#ifdef HAS_ARGBCOPYYTOALPHAROW_MMI -ANY11B(ARGBCopyYToAlphaRow_Any_MMI, ARGBCopyYToAlphaRow_MMI, 0, 1, 4, 7) -#endif -#undef ANY11B - -// Any 1 to 1 with parameter. 
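The ANY11-family wrappers above all use the same remainder-handling trick: run the SIMD kernel on the largest multiple of its step, then route the last few pixels through a padded temp buffer so the kernel never reads or writes past the end of the row. A minimal sketch follows, assuming a hypothetical 16-pixel kernel named ExampleRow_SIMD; only SIMD_ALIGNED, memcpy and memset come from the surrounding file.

void ExampleRow_Any(const uint8_t* src_ptr, uint8_t* dst_ptr, int width) {
  SIMD_ALIGNED(uint8_t temp[64 * 2]);
  memset(temp, 0, 64);                    /* keep msan quiet, as above */
  int r = width & 15;                     /* leftover pixels */
  int n = width & ~15;                    /* largest multiple of 16 */
  if (n > 0) {
    ExampleRow_SIMD(src_ptr, dst_ptr, n); /* bulk of the row at full speed */
  }
  memcpy(temp, src_ptr + n, r);           /* stage the tail in padded storage */
  ExampleRow_SIMD(temp, temp + 64, 16);   /* one full-width pass over the tail */
  memcpy(dst_ptr + n, temp + 64, r);      /* copy back only the r valid pixels */
}
/* The parameterized wrappers defined below follow the same split, forwarding
 * one extra argument to the kernel. */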
-#define ANY11P(NAMEANY, ANY_SIMD, T, SBPP, BPP, MASK) \ - void NAMEANY(const uint8_t* src_ptr, uint8_t* dst_ptr, T param, int width) { \ - SIMD_ALIGNED(uint8_t temp[64 * 2]); \ - memset(temp, 0, 64); /* for msan */ \ - int r = width & MASK; \ - int n = width & ~MASK; \ - if (n > 0) { \ - ANY_SIMD(src_ptr, dst_ptr, param, n); \ - } \ - memcpy(temp, src_ptr + n * SBPP, r * SBPP); \ - ANY_SIMD(temp, temp + 64, param, MASK + 1); \ - memcpy(dst_ptr + n * BPP, temp + 64, r * BPP); \ - } - -#if defined(HAS_I400TOARGBROW_SSE2) -ANY11P(I400ToARGBRow_Any_SSE2, - I400ToARGBRow_SSE2, - const struct YuvConstants*, - 1, - 4, - 7) -#endif -#if defined(HAS_I400TOARGBROW_AVX2) -ANY11P(I400ToARGBRow_Any_AVX2, - I400ToARGBRow_AVX2, - const struct YuvConstants*, - 1, - 4, - 15) -#endif -#if defined(HAS_I400TOARGBROW_NEON) -ANY11P(I400ToARGBRow_Any_NEON, - I400ToARGBRow_NEON, - const struct YuvConstants*, - 1, - 4, - 7) -#endif -#if defined(HAS_I400TOARGBROW_MSA) -ANY11P(I400ToARGBRow_Any_MSA, - I400ToARGBRow_MSA, - const struct YuvConstants*, - 1, - 4, - 15) -#endif -#if defined(HAS_I400TOARGBROW_MMI) -ANY11P(I400ToARGBRow_Any_MMI, - I400ToARGBRow_MMI, - const struct YuvConstants*, - 1, - 4, - 7) -#endif - -#if defined(HAS_ARGBTORGB565DITHERROW_SSE2) -ANY11P(ARGBToRGB565DitherRow_Any_SSE2, - ARGBToRGB565DitherRow_SSE2, - const uint32_t, - 4, - 2, - 3) -#endif -#if defined(HAS_ARGBTORGB565DITHERROW_AVX2) -ANY11P(ARGBToRGB565DitherRow_Any_AVX2, - ARGBToRGB565DitherRow_AVX2, - const uint32_t, - 4, - 2, - 7) -#endif -#if defined(HAS_ARGBTORGB565DITHERROW_NEON) -ANY11P(ARGBToRGB565DitherRow_Any_NEON, - ARGBToRGB565DitherRow_NEON, - const uint32_t, - 4, - 2, - 7) -#endif -#if defined(HAS_ARGBTORGB565DITHERROW_MSA) -ANY11P(ARGBToRGB565DitherRow_Any_MSA, - ARGBToRGB565DitherRow_MSA, - const uint32_t, - 4, - 2, - 7) -#endif -#if defined(HAS_ARGBTORGB565DITHERROW_MMI) -ANY11P(ARGBToRGB565DitherRow_Any_MMI, - ARGBToRGB565DitherRow_MMI, - const uint32_t, - 4, - 2, - 3) -#endif -#ifdef HAS_ARGBSHUFFLEROW_SSSE3 -ANY11P(ARGBShuffleRow_Any_SSSE3, ARGBShuffleRow_SSSE3, const uint8_t*, 4, 4, 7) -#endif -#ifdef HAS_ARGBSHUFFLEROW_AVX2 -ANY11P(ARGBShuffleRow_Any_AVX2, ARGBShuffleRow_AVX2, const uint8_t*, 4, 4, 15) -#endif -#ifdef HAS_ARGBSHUFFLEROW_NEON -ANY11P(ARGBShuffleRow_Any_NEON, ARGBShuffleRow_NEON, const uint8_t*, 4, 4, 3) -#endif -#ifdef HAS_ARGBSHUFFLEROW_MSA -ANY11P(ARGBShuffleRow_Any_MSA, ARGBShuffleRow_MSA, const uint8_t*, 4, 4, 7) -#endif -#ifdef HAS_ARGBSHUFFLEROW_MMI -ANY11P(ARGBShuffleRow_Any_MMI, ARGBShuffleRow_MMI, const uint8_t*, 4, 4, 1) -#endif -#undef ANY11P -#undef ANY11P - -// Any 1 to 1 with type -#define ANY11T(NAMEANY, ANY_SIMD, SBPP, BPP, STYPE, DTYPE, MASK) \ - void NAMEANY(const STYPE* src_ptr, DTYPE* dst_ptr, int width) { \ - SIMD_ALIGNED(uint8_t temp[(MASK + 1) * SBPP]); \ - SIMD_ALIGNED(uint8_t out[(MASK + 1) * BPP]); \ - memset(temp, 0, (MASK + 1) * SBPP); /* for msan */ \ - int r = width & MASK; \ - int n = width & ~MASK; \ - if (n > 0) { \ - ANY_SIMD(src_ptr, dst_ptr, n); \ - } \ - memcpy(temp, (uint8_t*)(src_ptr) + n * SBPP, r * SBPP); \ - ANY_SIMD((STYPE*)temp, (DTYPE*)out, MASK + 1); \ - memcpy((uint8_t*)(dst_ptr) + n * BPP, out, r * BPP); \ - } - -#ifdef HAS_ARGBTOAR64ROW_SSSE3 -ANY11T(ARGBToAR64Row_Any_SSSE3, ARGBToAR64Row_SSSE3, 4, 8, uint8_t, uint16_t, 3) -#endif - -#ifdef HAS_ARGBTOAB64ROW_SSSE3 -ANY11T(ARGBToAB64Row_Any_SSSE3, ARGBToAB64Row_SSSE3, 4, 8, uint8_t, uint16_t, 3) -#endif - -#ifdef HAS_AR64TOARGBROW_SSSE3 -ANY11T(AR64ToARGBRow_Any_SSSE3, AR64ToARGBRow_SSSE3, 8, 4, 
uint16_t, uint8_t, 3) -#endif - -#ifdef HAS_ARGBTOAR64ROW_SSSE3 -ANY11T(AB64ToARGBRow_Any_SSSE3, AB64ToARGBRow_SSSE3, 8, 4, uint16_t, uint8_t, 3) -#endif - -#ifdef HAS_ARGBTOAR64ROW_AVX2 -ANY11T(ARGBToAR64Row_Any_AVX2, ARGBToAR64Row_AVX2, 4, 8, uint8_t, uint16_t, 7) -#endif - -#ifdef HAS_ARGBTOAB64ROW_AVX2 -ANY11T(ARGBToAB64Row_Any_AVX2, ARGBToAB64Row_AVX2, 4, 8, uint8_t, uint16_t, 7) -#endif - -#ifdef HAS_AR64TOARGBROW_AVX2 -ANY11T(AR64ToARGBRow_Any_AVX2, AR64ToARGBRow_AVX2, 8, 4, uint16_t, uint8_t, 7) -#endif - -#ifdef HAS_ARGBTOAR64ROW_AVX2 -ANY11T(AB64ToARGBRow_Any_AVX2, AB64ToARGBRow_AVX2, 8, 4, uint16_t, uint8_t, 7) -#endif - -#ifdef HAS_ARGBTOAR64ROW_NEON -ANY11T(ARGBToAR64Row_Any_NEON, ARGBToAR64Row_NEON, 4, 8, uint8_t, uint16_t, 7) -#endif - -#ifdef HAS_ARGBTOAB64ROW_NEON -ANY11T(ARGBToAB64Row_Any_NEON, ARGBToAB64Row_NEON, 4, 8, uint8_t, uint16_t, 7) -#endif - -#ifdef HAS_AR64TOARGBROW_NEON -ANY11T(AR64ToARGBRow_Any_NEON, AR64ToARGBRow_NEON, 8, 4, uint16_t, uint8_t, 7) -#endif - -#ifdef HAS_ARGBTOAR64ROW_NEON -ANY11T(AB64ToARGBRow_Any_NEON, AB64ToARGBRow_NEON, 8, 4, uint16_t, uint8_t, 7) -#endif - -#undef ANY11T - -// Any 1 to 1 with parameter and shorts. BPP measures in shorts. -#define ANY11C(NAMEANY, ANY_SIMD, SBPP, BPP, STYPE, DTYPE, MASK) \ - void NAMEANY(const STYPE* src_ptr, DTYPE* dst_ptr, int scale, int width) { \ - SIMD_ALIGNED(STYPE temp[32]); \ - SIMD_ALIGNED(DTYPE out[32]); \ - memset(temp, 0, 32 * SBPP); /* for msan */ \ - int r = width & MASK; \ - int n = width & ~MASK; \ - if (n > 0) { \ - ANY_SIMD(src_ptr, dst_ptr, scale, n); \ - } \ - memcpy(temp, src_ptr + n, r * SBPP); \ - ANY_SIMD(temp, out, scale, MASK + 1); \ - memcpy(dst_ptr + n, out, r * BPP); \ - } - -#ifdef HAS_CONVERT16TO8ROW_SSSE3 -ANY11C(Convert16To8Row_Any_SSSE3, - Convert16To8Row_SSSE3, - 2, - 1, - uint16_t, - uint8_t, - 15) -#endif -#ifdef HAS_CONVERT16TO8ROW_AVX2 -ANY11C(Convert16To8Row_Any_AVX2, - Convert16To8Row_AVX2, - 2, - 1, - uint16_t, - uint8_t, - 31) -#endif -#ifdef HAS_CONVERT8TO16ROW_SSE2 -ANY11C(Convert8To16Row_Any_SSE2, - Convert8To16Row_SSE2, - 1, - 2, - uint8_t, - uint16_t, - 15) -#endif -#ifdef HAS_CONVERT8TO16ROW_AVX2 -ANY11C(Convert8To16Row_Any_AVX2, - Convert8To16Row_AVX2, - 1, - 2, - uint8_t, - uint16_t, - 31) -#endif -#ifdef HAS_MULTIPLYROW_16_AVX2 -ANY11C(MultiplyRow_16_Any_AVX2, - MultiplyRow_16_AVX2, - 2, - 2, - uint16_t, - uint16_t, - 31) -#endif -#ifdef HAS_MULTIPLYROW_16_NEON -ANY11C(MultiplyRow_16_Any_NEON, - MultiplyRow_16_NEON, - 2, - 2, - uint16_t, - uint16_t, - 15) -#endif -#ifdef HAS_DIVIDEROW_16_AVX2 -ANY11C(DivideRow_16_Any_AVX2, DivideRow_16_AVX2, 2, 2, uint16_t, uint16_t, 31) -#endif -#ifdef HAS_DIVIDEROW_16_NEON -ANY11C(DivideRow_16_Any_NEON, DivideRow_16_NEON, 2, 2, uint16_t, uint16_t, 15) -#endif -#undef ANY11C - -// Any 1 to 1 with parameter and shorts to byte. BPP measures in shorts. 
-#define ANY11P16(NAMEANY, ANY_SIMD, ST, T, SBPP, BPP, MASK) \ - void NAMEANY(const ST* src_ptr, T* dst_ptr, float param, int width) { \ - SIMD_ALIGNED(ST temp[32]); \ - SIMD_ALIGNED(T out[32]); \ - memset(temp, 0, SBPP * 32); /* for msan */ \ - int r = width & MASK; \ - int n = width & ~MASK; \ - if (n > 0) { \ - ANY_SIMD(src_ptr, dst_ptr, param, n); \ - } \ - memcpy(temp, src_ptr + n, r * SBPP); \ - ANY_SIMD(temp, out, param, MASK + 1); \ - memcpy(dst_ptr + n, out, r * BPP); \ - } - -#ifdef HAS_HALFFLOATROW_SSE2 -ANY11P16(HalfFloatRow_Any_SSE2, HalfFloatRow_SSE2, uint16_t, uint16_t, 2, 2, 7) -#endif -#ifdef HAS_HALFFLOATROW_AVX2 -ANY11P16(HalfFloatRow_Any_AVX2, HalfFloatRow_AVX2, uint16_t, uint16_t, 2, 2, 15) -#endif -#ifdef HAS_HALFFLOATROW_F16C -ANY11P16(HalfFloatRow_Any_F16C, HalfFloatRow_F16C, uint16_t, uint16_t, 2, 2, 15) -ANY11P16(HalfFloat1Row_Any_F16C, - HalfFloat1Row_F16C, - uint16_t, - uint16_t, - 2, - 2, - 15) -#endif -#ifdef HAS_HALFFLOATROW_NEON -ANY11P16(HalfFloatRow_Any_NEON, HalfFloatRow_NEON, uint16_t, uint16_t, 2, 2, 7) -ANY11P16(HalfFloat1Row_Any_NEON, - HalfFloat1Row_NEON, - uint16_t, - uint16_t, - 2, - 2, - 7) -#endif -#ifdef HAS_HALFFLOATROW_MSA -ANY11P16(HalfFloatRow_Any_MSA, HalfFloatRow_MSA, uint16_t, uint16_t, 2, 2, 31) -#endif -#ifdef HAS_BYTETOFLOATROW_NEON -ANY11P16(ByteToFloatRow_Any_NEON, ByteToFloatRow_NEON, uint8_t, float, 1, 3, 7) -#endif -#undef ANY11P16 - -// Any 1 to 1 with yuvconstants -#define ANY11C(NAMEANY, ANY_SIMD, UVSHIFT, SBPP, BPP, MASK) \ - void NAMEANY(const uint8_t* src_ptr, uint8_t* dst_ptr, \ - const struct YuvConstants* yuvconstants, int width) { \ - SIMD_ALIGNED(uint8_t temp[128 * 2]); \ - memset(temp, 0, 128); /* for YUY2 and msan */ \ - int r = width & MASK; \ - int n = width & ~MASK; \ - if (n > 0) { \ - ANY_SIMD(src_ptr, dst_ptr, yuvconstants, n); \ - } \ - memcpy(temp, src_ptr + (n >> UVSHIFT) * SBPP, SS(r, UVSHIFT) * SBPP); \ - ANY_SIMD(temp, temp + 128, yuvconstants, MASK + 1); \ - memcpy(dst_ptr + n * BPP, temp + 128, r * BPP); \ - } -#if defined(HAS_YUY2TOARGBROW_SSSE3) -ANY11C(YUY2ToARGBRow_Any_SSSE3, YUY2ToARGBRow_SSSE3, 1, 4, 4, 15) -ANY11C(UYVYToARGBRow_Any_SSSE3, UYVYToARGBRow_SSSE3, 1, 4, 4, 15) -#endif -#if defined(HAS_YUY2TOARGBROW_AVX2) -ANY11C(YUY2ToARGBRow_Any_AVX2, YUY2ToARGBRow_AVX2, 1, 4, 4, 31) -ANY11C(UYVYToARGBRow_Any_AVX2, UYVYToARGBRow_AVX2, 1, 4, 4, 31) -#endif -#if defined(HAS_YUY2TOARGBROW_NEON) -ANY11C(YUY2ToARGBRow_Any_NEON, YUY2ToARGBRow_NEON, 1, 4, 4, 7) -ANY11C(UYVYToARGBRow_Any_NEON, UYVYToARGBRow_NEON, 1, 4, 4, 7) -#endif -#if defined(HAS_YUY2TOARGBROW_MSA) -ANY11C(YUY2ToARGBRow_Any_MSA, YUY2ToARGBRow_MSA, 1, 4, 4, 7) -ANY11C(UYVYToARGBRow_Any_MSA, UYVYToARGBRow_MSA, 1, 4, 4, 7) -#endif -#if defined(HAS_YUY2TOARGBROW_MMI) -ANY11C(YUY2ToARGBRow_Any_MMI, YUY2ToARGBRow_MMI, 1, 4, 4, 7) -ANY11C(UYVYToARGBRow_Any_MMI, UYVYToARGBRow_MMI, 1, 4, 4, 7) -#endif -#undef ANY11C - -// Any 1 to 1 interpolate. Takes 2 rows of source via stride. 
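For context, these _Any_ wrappers are what the higher-level converters select when the image width is not a multiple of the SIMD step. The fragment below sketches the usual dispatch, using the YUY2ToARGBRow variants instantiated above; the call site itself is assumed rather than quoted from this tree, while TestCpuFlag, kCpuHasSSSE3 and IS_ALIGNED are existing libyuv helpers.

/* Inside a conversion routine such as YUY2ToARGB (illustrative): */
void (*YUY2ToARGBRow)(const uint8_t* src_yuy2, uint8_t* dst_argb,
                      const struct YuvConstants* yuvconstants, int width) =
    YUY2ToARGBRow_C;
#if defined(HAS_YUY2TOARGBROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
  YUY2ToARGBRow = YUY2ToARGBRow_Any_SSSE3; /* safe for any width */
  if (IS_ALIGNED(width, 16)) {
    YUY2ToARGBRow = YUY2ToARGBRow_SSSE3;   /* full-SIMD path when width % 16 == 0 */
  }
}
#endif
/* The interpolate wrapper defined next takes two source rows via a stride but
 * is dispatched the same way. */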
-#define ANY11I(NAMEANY, ANY_SIMD, SBPP, BPP, MASK) \ - void NAMEANY(uint8_t* dst_ptr, const uint8_t* src_ptr, ptrdiff_t src_stride, \ - int width, int source_y_fraction) { \ - SIMD_ALIGNED(uint8_t temp[64 * 3]); \ - memset(temp, 0, 64 * 2); /* for msan */ \ - int r = width & MASK; \ - int n = width & ~MASK; \ - if (n > 0) { \ - ANY_SIMD(dst_ptr, src_ptr, src_stride, n, source_y_fraction); \ - } \ - memcpy(temp, src_ptr + n * SBPP, r * SBPP); \ - memcpy(temp + 64, src_ptr + src_stride + n * SBPP, r * SBPP); \ - ANY_SIMD(temp + 128, temp, 64, MASK + 1, source_y_fraction); \ - memcpy(dst_ptr + n * BPP, temp + 128, r * BPP); \ - } - -#ifdef HAS_INTERPOLATEROW_AVX2 -ANY11I(InterpolateRow_Any_AVX2, InterpolateRow_AVX2, 1, 1, 31) -#endif -#ifdef HAS_INTERPOLATEROW_SSSE3 -ANY11I(InterpolateRow_Any_SSSE3, InterpolateRow_SSSE3, 1, 1, 15) -#endif -#ifdef HAS_INTERPOLATEROW_NEON -ANY11I(InterpolateRow_Any_NEON, InterpolateRow_NEON, 1, 1, 15) -#endif -#ifdef HAS_INTERPOLATEROW_MSA -ANY11I(InterpolateRow_Any_MSA, InterpolateRow_MSA, 1, 1, 31) -#endif -#ifdef HAS_INTERPOLATEROW_MMI -ANY11I(InterpolateRow_Any_MMI, InterpolateRow_MMI, 1, 1, 7) -#endif -#undef ANY11I - -// Any 1 to 1 mirror. -#define ANY11M(NAMEANY, ANY_SIMD, BPP, MASK) \ - void NAMEANY(const uint8_t* src_ptr, uint8_t* dst_ptr, int width) { \ - SIMD_ALIGNED(uint8_t temp[64 * 2]); \ - memset(temp, 0, 64); /* for msan */ \ - int r = width & MASK; \ - int n = width & ~MASK; \ - if (n > 0) { \ - ANY_SIMD(src_ptr + r * BPP, dst_ptr, n); \ - } \ - memcpy(temp, src_ptr, r* BPP); \ - ANY_SIMD(temp, temp + 64, MASK + 1); \ - memcpy(dst_ptr + n * BPP, temp + 64 + (MASK + 1 - r) * BPP, r * BPP); \ - } - -#ifdef HAS_MIRRORROW_AVX2 -ANY11M(MirrorRow_Any_AVX2, MirrorRow_AVX2, 1, 31) -#endif -#ifdef HAS_MIRRORROW_SSSE3 -ANY11M(MirrorRow_Any_SSSE3, MirrorRow_SSSE3, 1, 15) -#endif -#ifdef HAS_MIRRORROW_NEON -ANY11M(MirrorRow_Any_NEON, MirrorRow_NEON, 1, 31) -#endif -#ifdef HAS_MIRRORROW_MSA -ANY11M(MirrorRow_Any_MSA, MirrorRow_MSA, 1, 63) -#endif -#ifdef HAS_MIRRORROW_MMI -ANY11M(MirrorRow_Any_MMI, MirrorRow_MMI, 1, 7) -#endif -#ifdef HAS_MIRRORUVROW_AVX2 -ANY11M(MirrorUVRow_Any_AVX2, MirrorUVRow_AVX2, 2, 15) -#endif -#ifdef HAS_MIRRORUVROW_SSSE3 -ANY11M(MirrorUVRow_Any_SSSE3, MirrorUVRow_SSSE3, 2, 7) -#endif -#ifdef HAS_MIRRORUVROW_NEON -ANY11M(MirrorUVRow_Any_NEON, MirrorUVRow_NEON, 2, 31) -#endif -#ifdef HAS_MIRRORUVROW_MSA -ANY11M(MirrorUVRow_Any_MSA, MirrorUVRow_MSA, 2, 7) -#endif -#ifdef HAS_ARGBMIRRORROW_AVX2 -ANY11M(ARGBMirrorRow_Any_AVX2, ARGBMirrorRow_AVX2, 4, 7) -#endif -#ifdef HAS_ARGBMIRRORROW_SSE2 -ANY11M(ARGBMirrorRow_Any_SSE2, ARGBMirrorRow_SSE2, 4, 3) -#endif -#ifdef HAS_ARGBMIRRORROW_NEON -ANY11M(ARGBMirrorRow_Any_NEON, ARGBMirrorRow_NEON, 4, 7) -#endif -#ifdef HAS_ARGBMIRRORROW_MSA -ANY11M(ARGBMirrorRow_Any_MSA, ARGBMirrorRow_MSA, 4, 15) -#endif -#ifdef HAS_ARGBMIRRORROW_MMI -ANY11M(ARGBMirrorRow_Any_MMI, ARGBMirrorRow_MMI, 4, 1) -#endif -#ifdef HAS_RGB24MIRRORROW_SSSE3 -ANY11M(RGB24MirrorRow_Any_SSSE3, RGB24MirrorRow_SSSE3, 3, 15) -#endif -#ifdef HAS_RGB24MIRRORROW_NEON -ANY11M(RGB24MirrorRow_Any_NEON, RGB24MirrorRow_NEON, 3, 15) -#endif -#undef ANY11M - -// Any 1 plane. 
(memset) -#define ANY1(NAMEANY, ANY_SIMD, T, BPP, MASK) \ - void NAMEANY(uint8_t* dst_ptr, T v32, int width) { \ - SIMD_ALIGNED(uint8_t temp[64]); \ - memset(temp, 0, 64); /* for msan */ \ - int r = width & MASK; \ - int n = width & ~MASK; \ - if (n > 0) { \ - ANY_SIMD(dst_ptr, v32, n); \ - } \ - ANY_SIMD(temp, v32, MASK + 1); \ - memcpy(dst_ptr + n * BPP, temp, r * BPP); \ - } - -#ifdef HAS_SETROW_X86 -ANY1(SetRow_Any_X86, SetRow_X86, uint8_t, 1, 3) -#endif -#ifdef HAS_SETROW_NEON -ANY1(SetRow_Any_NEON, SetRow_NEON, uint8_t, 1, 15) -#endif -#ifdef HAS_ARGBSETROW_NEON -ANY1(ARGBSetRow_Any_NEON, ARGBSetRow_NEON, uint32_t, 4, 3) -#endif -#ifdef HAS_ARGBSETROW_MSA -ANY1(ARGBSetRow_Any_MSA, ARGBSetRow_MSA, uint32_t, 4, 3) -#endif -#ifdef HAS_ARGBSETROW_MMI -ANY1(ARGBSetRow_Any_MMI, ARGBSetRow_MMI, uint32_t, 4, 3) -#endif -#undef ANY1 - -// Any 1 to 2. Outputs UV planes. -#define ANY12(NAMEANY, ANY_SIMD, UVSHIFT, BPP, DUVSHIFT, MASK) \ - void NAMEANY(const uint8_t* src_ptr, uint8_t* dst_u, uint8_t* dst_v, \ - int width) { \ - SIMD_ALIGNED(uint8_t temp[128 * 3]); \ - memset(temp, 0, 128); /* for msan */ \ - int r = width & MASK; \ - int n = width & ~MASK; \ - if (n > 0) { \ - ANY_SIMD(src_ptr, dst_u, dst_v, n); \ - } \ - memcpy(temp, src_ptr + (n >> UVSHIFT) * BPP, SS(r, UVSHIFT) * BPP); \ - ANY_SIMD(temp, temp + 128, temp + 256, MASK + 1); \ - memcpy(dst_u + (n >> DUVSHIFT), temp + 128, SS(r, DUVSHIFT)); \ - memcpy(dst_v + (n >> DUVSHIFT), temp + 256, SS(r, DUVSHIFT)); \ - } - -#ifdef HAS_SPLITUVROW_SSE2 -ANY12(SplitUVRow_Any_SSE2, SplitUVRow_SSE2, 0, 2, 0, 15) -#endif -#ifdef HAS_SPLITUVROW_AVX2 -ANY12(SplitUVRow_Any_AVX2, SplitUVRow_AVX2, 0, 2, 0, 31) -#endif -#ifdef HAS_SPLITUVROW_NEON -ANY12(SplitUVRow_Any_NEON, SplitUVRow_NEON, 0, 2, 0, 15) -#endif -#ifdef HAS_SPLITUVROW_MSA -ANY12(SplitUVRow_Any_MSA, SplitUVRow_MSA, 0, 2, 0, 31) -#endif -#ifdef HAS_SPLITUVROW_MMI -ANY12(SplitUVRow_Any_MMI, SplitUVRow_MMI, 0, 2, 0, 7) -#endif -#ifdef HAS_ARGBTOUV444ROW_SSSE3 -ANY12(ARGBToUV444Row_Any_SSSE3, ARGBToUV444Row_SSSE3, 0, 4, 0, 15) -#endif -#ifdef HAS_YUY2TOUV422ROW_AVX2 -ANY12(YUY2ToUV422Row_Any_AVX2, YUY2ToUV422Row_AVX2, 1, 4, 1, 31) -ANY12(UYVYToUV422Row_Any_AVX2, UYVYToUV422Row_AVX2, 1, 4, 1, 31) -#endif -#ifdef HAS_YUY2TOUV422ROW_SSE2 -ANY12(YUY2ToUV422Row_Any_SSE2, YUY2ToUV422Row_SSE2, 1, 4, 1, 15) -ANY12(UYVYToUV422Row_Any_SSE2, UYVYToUV422Row_SSE2, 1, 4, 1, 15) -#endif -#ifdef HAS_YUY2TOUV422ROW_NEON -ANY12(ARGBToUV444Row_Any_NEON, ARGBToUV444Row_NEON, 0, 4, 0, 7) -ANY12(YUY2ToUV422Row_Any_NEON, YUY2ToUV422Row_NEON, 1, 4, 1, 15) -ANY12(UYVYToUV422Row_Any_NEON, UYVYToUV422Row_NEON, 1, 4, 1, 15) -#endif -#ifdef HAS_YUY2TOUV422ROW_MSA -ANY12(ARGBToUV444Row_Any_MSA, ARGBToUV444Row_MSA, 0, 4, 0, 15) -ANY12(YUY2ToUV422Row_Any_MSA, YUY2ToUV422Row_MSA, 1, 4, 1, 31) -ANY12(UYVYToUV422Row_Any_MSA, UYVYToUV422Row_MSA, 1, 4, 1, 31) -#endif -#ifdef HAS_YUY2TOUV422ROW_MMI -ANY12(ARGBToUV444Row_Any_MMI, ARGBToUV444Row_MMI, 0, 4, 0, 7) -ANY12(UYVYToUV422Row_Any_MMI, UYVYToUV422Row_MMI, 1, 4, 1, 15) -ANY12(YUY2ToUV422Row_Any_MMI, YUY2ToUV422Row_MMI, 1, 4, 1, 15) -#endif -#undef ANY12 - -// Any 2 16 bit planes with parameter to 1 -#define ANY12PT(NAMEANY, ANY_SIMD, T, BPP, MASK) \ - void NAMEANY(const T* src_uv, T* dst_u, T* dst_v, int depth, int width) { \ - SIMD_ALIGNED(T temp[16 * 4]); \ - memset(temp, 0, 16 * 4 * BPP); /* for msan */ \ - int r = width & MASK; \ - int n = width & ~MASK; \ - if (n > 0) { \ - ANY_SIMD(src_uv, dst_u, dst_v, depth, n); \ - } \ - memcpy(temp, src_uv + n * 2, r * BPP * 2); \ 
- ANY_SIMD(temp, temp + 32, temp + 48, depth, MASK + 1); \ - memcpy(dst_u + n, temp + 32, r * BPP); \ - memcpy(dst_v + n, temp + 48, r * BPP); \ - } - -#ifdef HAS_SPLITUVROW_16_AVX2 -ANY12PT(SplitUVRow_16_Any_AVX2, SplitUVRow_16_AVX2, uint16_t, 2, 15) -#endif - -#ifdef HAS_SPLITUVROW_16_NEON -ANY12PT(SplitUVRow_16_Any_NEON, SplitUVRow_16_NEON, uint16_t, 2, 7) -#endif - -#undef ANY21CT - -// Any 1 to 3. Outputs RGB planes. -#define ANY13(NAMEANY, ANY_SIMD, BPP, MASK) \ - void NAMEANY(const uint8_t* src_ptr, uint8_t* dst_r, uint8_t* dst_g, \ - uint8_t* dst_b, int width) { \ - SIMD_ALIGNED(uint8_t temp[16 * 6]); \ - memset(temp, 0, 16 * 3); /* for msan */ \ - int r = width & MASK; \ - int n = width & ~MASK; \ - if (n > 0) { \ - ANY_SIMD(src_ptr, dst_r, dst_g, dst_b, n); \ - } \ - memcpy(temp, src_ptr + n * BPP, r * BPP); \ - ANY_SIMD(temp, temp + 16 * 3, temp + 16 * 4, temp + 16 * 5, MASK + 1); \ - memcpy(dst_r + n, temp + 16 * 3, r); \ - memcpy(dst_g + n, temp + 16 * 4, r); \ - memcpy(dst_b + n, temp + 16 * 5, r); \ - } - -#ifdef HAS_SPLITRGBROW_SSSE3 -ANY13(SplitRGBRow_Any_SSSE3, SplitRGBRow_SSSE3, 3, 15) -#endif -#ifdef HAS_SPLITRGBROW_NEON -ANY13(SplitRGBRow_Any_NEON, SplitRGBRow_NEON, 3, 15) -#endif -#ifdef HAS_SPLITRGBROW_MMI -ANY13(SplitRGBRow_Any_MMI, SplitRGBRow_MMI, 3, 3) -#endif -#ifdef HAS_SPLITXRGBROW_SSE2 -ANY13(SplitXRGBRow_Any_SSE2, SplitXRGBRow_SSE2, 4, 7) -#endif -#ifdef HAS_SPLITXRGBROW_SSSE3 -ANY13(SplitXRGBRow_Any_SSSE3, SplitXRGBRow_SSSE3, 4, 7) -#endif -#ifdef HAS_SPLITXRGBROW_AVX2 -ANY13(SplitXRGBRow_Any_AVX2, SplitXRGBRow_AVX2, 4, 15) -#endif -#ifdef HAS_SPLITXRGBROW_NEON -ANY13(SplitXRGBRow_Any_NEON, SplitXRGBRow_NEON, 4, 15) -#endif - -// Any 1 to 4. Outputs ARGB planes. -#define ANY14(NAMEANY, ANY_SIMD, BPP, MASK) \ - void NAMEANY(const uint8_t* src_ptr, uint8_t* dst_r, uint8_t* dst_g, \ - uint8_t* dst_b, uint8_t* dst_a, int width) { \ - SIMD_ALIGNED(uint8_t temp[16 * 8]); \ - memset(temp, 0, 16 * 4); /* for msan */ \ - int r = width & MASK; \ - int n = width & ~MASK; \ - if (n > 0) { \ - ANY_SIMD(src_ptr, dst_r, dst_g, dst_b, dst_a, n); \ - } \ - memcpy(temp, src_ptr + n * BPP, r * BPP); \ - ANY_SIMD(temp, temp + 16 * 4, temp + 16 * 5, temp + 16 * 6, temp + 16 * 7, \ - MASK + 1); \ - memcpy(dst_r + n, temp + 16 * 4, r); \ - memcpy(dst_g + n, temp + 16 * 5, r); \ - memcpy(dst_b + n, temp + 16 * 6, r); \ - memcpy(dst_a + n, temp + 16 * 7, r); \ - } - -#ifdef HAS_SPLITARGBROW_SSE2 -ANY14(SplitARGBRow_Any_SSE2, SplitARGBRow_SSE2, 4, 7) -#endif -#ifdef HAS_SPLITARGBROW_SSSE3 -ANY14(SplitARGBRow_Any_SSSE3, SplitARGBRow_SSSE3, 4, 7) -#endif -#ifdef HAS_SPLITARGBROW_AVX2 -ANY14(SplitARGBRow_Any_AVX2, SplitARGBRow_AVX2, 4, 15) -#endif -#ifdef HAS_SPLITARGBROW_NEON -ANY14(SplitARGBRow_Any_NEON, SplitARGBRow_NEON, 4, 15) -#endif - -// Any 1 to 2 with source stride (2 rows of source). Outputs UV planes. -// 128 byte row allows for 32 avx ARGB pixels. 
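The stride-based splitters defined next emit one U and one V sample per 2x2 block, so the leftover width has to be rounded up to whole subsample groups. A small worked example follows; SS_DEMO stands in for the SS() helper defined near the top of this file, which is assumed to perform the same round-up.

#define SS_DEMO(width, shift) (((width) + (1 << (shift)) - 1) >> (shift))
/* ARGBToUVRow_Any_SSSE3 (UVSHIFT = 0, BPP = 4, MASK = 15) with width = 29: */
/*   n = 16 full pixels, r = 13 leftover, so 13 * 4 source bytes are staged */
/*   from each of the two rows and SS_DEMO(13, 1) = 7 bytes are written to  */
/*   dst_u and to dst_v.                                                    */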
-#define ANY12S(NAMEANY, ANY_SIMD, UVSHIFT, BPP, MASK) \ - void NAMEANY(const uint8_t* src_ptr, int src_stride, uint8_t* dst_u, \ - uint8_t* dst_v, int width) { \ - SIMD_ALIGNED(uint8_t temp[128 * 4]); \ - memset(temp, 0, 128 * 2); /* for msan */ \ - int r = width & MASK; \ - int n = width & ~MASK; \ - if (n > 0) { \ - ANY_SIMD(src_ptr, src_stride, dst_u, dst_v, n); \ - } \ - memcpy(temp, src_ptr + (n >> UVSHIFT) * BPP, SS(r, UVSHIFT) * BPP); \ - memcpy(temp + 128, src_ptr + src_stride + (n >> UVSHIFT) * BPP, \ - SS(r, UVSHIFT) * BPP); \ - if ((width & 1) && UVSHIFT == 0) { /* repeat last pixel for subsample */ \ - memcpy(temp + SS(r, UVSHIFT) * BPP, temp + SS(r, UVSHIFT) * BPP - BPP, \ - BPP); \ - memcpy(temp + 128 + SS(r, UVSHIFT) * BPP, \ - temp + 128 + SS(r, UVSHIFT) * BPP - BPP, BPP); \ - } \ - ANY_SIMD(temp, 128, temp + 256, temp + 384, MASK + 1); \ - memcpy(dst_u + (n >> 1), temp + 256, SS(r, 1)); \ - memcpy(dst_v + (n >> 1), temp + 384, SS(r, 1)); \ - } - -#ifdef HAS_ARGBTOUVROW_AVX2 -ANY12S(ARGBToUVRow_Any_AVX2, ARGBToUVRow_AVX2, 0, 4, 31) -#endif -#ifdef HAS_ABGRTOUVROW_AVX2 -ANY12S(ABGRToUVRow_Any_AVX2, ABGRToUVRow_AVX2, 0, 4, 31) -#endif -#ifdef HAS_ARGBTOUVJROW_AVX2 -ANY12S(ARGBToUVJRow_Any_AVX2, ARGBToUVJRow_AVX2, 0, 4, 31) -#endif -#ifdef HAS_ARGBTOUVROW_SSSE3 -ANY12S(ARGBToUVRow_Any_SSSE3, ARGBToUVRow_SSSE3, 0, 4, 15) -ANY12S(ARGBToUVJRow_Any_SSSE3, ARGBToUVJRow_SSSE3, 0, 4, 15) -ANY12S(BGRAToUVRow_Any_SSSE3, BGRAToUVRow_SSSE3, 0, 4, 15) -ANY12S(ABGRToUVRow_Any_SSSE3, ABGRToUVRow_SSSE3, 0, 4, 15) -ANY12S(RGBAToUVRow_Any_SSSE3, RGBAToUVRow_SSSE3, 0, 4, 15) -#endif -#ifdef HAS_YUY2TOUVROW_AVX2 -ANY12S(YUY2ToUVRow_Any_AVX2, YUY2ToUVRow_AVX2, 1, 4, 31) -ANY12S(UYVYToUVRow_Any_AVX2, UYVYToUVRow_AVX2, 1, 4, 31) -#endif -#ifdef HAS_YUY2TOUVROW_SSE2 -ANY12S(YUY2ToUVRow_Any_SSE2, YUY2ToUVRow_SSE2, 1, 4, 15) -ANY12S(UYVYToUVRow_Any_SSE2, UYVYToUVRow_SSE2, 1, 4, 15) -#endif -#ifdef HAS_ARGBTOUVROW_NEON -ANY12S(ARGBToUVRow_Any_NEON, ARGBToUVRow_NEON, 0, 4, 15) -#endif -#ifdef HAS_ARGBTOUVROW_MSA -ANY12S(ARGBToUVRow_Any_MSA, ARGBToUVRow_MSA, 0, 4, 31) -#endif -#ifdef HAS_ARGBTOUVROW_MMI -ANY12S(ARGBToUVRow_Any_MMI, ARGBToUVRow_MMI, 0, 4, 15) -#endif -#ifdef HAS_ARGBTOUVJROW_NEON -ANY12S(ARGBToUVJRow_Any_NEON, ARGBToUVJRow_NEON, 0, 4, 15) -#endif -#ifdef HAS_ARGBTOUVJROW_MSA -ANY12S(ARGBToUVJRow_Any_MSA, ARGBToUVJRow_MSA, 0, 4, 31) -#endif -#ifdef HAS_ARGBTOUVJROW_MMI -ANY12S(ARGBToUVJRow_Any_MMI, ARGBToUVJRow_MMI, 0, 4, 15) -#endif -#ifdef HAS_BGRATOUVROW_NEON -ANY12S(BGRAToUVRow_Any_NEON, BGRAToUVRow_NEON, 0, 4, 15) -#endif -#ifdef HAS_BGRATOUVROW_MSA -ANY12S(BGRAToUVRow_Any_MSA, BGRAToUVRow_MSA, 0, 4, 15) -#endif -#ifdef HAS_BGRATOUVROW_MMI -ANY12S(BGRAToUVRow_Any_MMI, BGRAToUVRow_MMI, 0, 4, 15) -#endif -#ifdef HAS_ABGRTOUVROW_NEON -ANY12S(ABGRToUVRow_Any_NEON, ABGRToUVRow_NEON, 0, 4, 15) -#endif -#ifdef HAS_ABGRTOUVROW_MSA -ANY12S(ABGRToUVRow_Any_MSA, ABGRToUVRow_MSA, 0, 4, 15) -#endif -#ifdef HAS_ABGRTOUVROW_MMI -ANY12S(ABGRToUVRow_Any_MMI, ABGRToUVRow_MMI, 0, 4, 15) -#endif -#ifdef HAS_RGBATOUVROW_NEON -ANY12S(RGBAToUVRow_Any_NEON, RGBAToUVRow_NEON, 0, 4, 15) -#endif -#ifdef HAS_RGBATOUVROW_MSA -ANY12S(RGBAToUVRow_Any_MSA, RGBAToUVRow_MSA, 0, 4, 15) -#endif -#ifdef HAS_RGBATOUVROW_MMI -ANY12S(RGBAToUVRow_Any_MMI, RGBAToUVRow_MMI, 0, 4, 15) -#endif -#ifdef HAS_RGB24TOUVROW_NEON -ANY12S(RGB24ToUVRow_Any_NEON, RGB24ToUVRow_NEON, 0, 3, 15) -#endif -#ifdef HAS_RGB24TOUVROW_MSA -ANY12S(RGB24ToUVRow_Any_MSA, RGB24ToUVRow_MSA, 0, 3, 15) -#endif -#ifdef HAS_RGB24TOUVROW_MMI 
-ANY12S(RGB24ToUVRow_Any_MMI, RGB24ToUVRow_MMI, 0, 3, 15) -#endif -#ifdef HAS_RAWTOUVROW_NEON -ANY12S(RAWToUVRow_Any_NEON, RAWToUVRow_NEON, 0, 3, 15) -#endif -#ifdef HAS_RAWTOUVROW_MSA -ANY12S(RAWToUVRow_Any_MSA, RAWToUVRow_MSA, 0, 3, 15) -#endif -#ifdef HAS_RAWTOUVROW_MMI -ANY12S(RAWToUVRow_Any_MMI, RAWToUVRow_MMI, 0, 3, 15) -#endif -#ifdef HAS_RGB565TOUVROW_NEON -ANY12S(RGB565ToUVRow_Any_NEON, RGB565ToUVRow_NEON, 0, 2, 15) -#endif -#ifdef HAS_RGB565TOUVROW_MSA -ANY12S(RGB565ToUVRow_Any_MSA, RGB565ToUVRow_MSA, 0, 2, 15) -#endif -#ifdef HAS_RGB565TOUVROW_MMI -ANY12S(RGB565ToUVRow_Any_MMI, RGB565ToUVRow_MMI, 0, 2, 15) -#endif -#ifdef HAS_ARGB1555TOUVROW_NEON -ANY12S(ARGB1555ToUVRow_Any_NEON, ARGB1555ToUVRow_NEON, 0, 2, 15) -#endif -#ifdef HAS_ARGB1555TOUVROW_MSA -ANY12S(ARGB1555ToUVRow_Any_MSA, ARGB1555ToUVRow_MSA, 0, 2, 15) -#endif -#ifdef HAS_ARGB1555TOUVROW_MMI -ANY12S(ARGB1555ToUVRow_Any_MMI, ARGB1555ToUVRow_MMI, 0, 2, 15) -#endif -#ifdef HAS_ARGB4444TOUVROW_NEON -ANY12S(ARGB4444ToUVRow_Any_NEON, ARGB4444ToUVRow_NEON, 0, 2, 15) -#endif -#ifdef HAS_ARGB4444TOUVROW_MMI -ANY12S(ARGB4444ToUVRow_Any_MMI, ARGB4444ToUVRow_MMI, 0, 2, 15) -#endif -#ifdef HAS_YUY2TOUVROW_NEON -ANY12S(YUY2ToUVRow_Any_NEON, YUY2ToUVRow_NEON, 1, 4, 15) -#endif -#ifdef HAS_UYVYTOUVROW_NEON -ANY12S(UYVYToUVRow_Any_NEON, UYVYToUVRow_NEON, 1, 4, 15) -#endif -#ifdef HAS_YUY2TOUVROW_MSA -ANY12S(YUY2ToUVRow_Any_MSA, YUY2ToUVRow_MSA, 1, 4, 31) -#endif -#ifdef HAS_YUY2TOUVROW_MMI -ANY12S(YUY2ToUVRow_Any_MMI, YUY2ToUVRow_MMI, 1, 4, 15) -#endif -#ifdef HAS_UYVYTOUVROW_MSA -ANY12S(UYVYToUVRow_Any_MSA, UYVYToUVRow_MSA, 1, 4, 31) -#endif -#ifdef HAS_UYVYTOUVROW_MMI -ANY12S(UYVYToUVRow_Any_MMI, UYVYToUVRow_MMI, 1, 4, 15) -#endif -#undef ANY12S - -// Any 1 to 1 with source stride (2 rows of source). Outputs UV plane. -// 128 byte row allows for 32 avx ARGB pixels. -#define ANY11S(NAMEANY, ANY_SIMD, UVSHIFT, BPP, MASK) \ - void NAMEANY(const uint8_t* src_ptr, int src_stride, uint8_t* dst_vu, \ - int width) { \ - SIMD_ALIGNED(uint8_t temp[128 * 3]); \ - memset(temp, 0, 128 * 2); /* for msan */ \ - int r = width & MASK; \ - int n = width & ~MASK; \ - if (n > 0) { \ - ANY_SIMD(src_ptr, src_stride, dst_vu, n); \ - } \ - memcpy(temp, src_ptr + (n >> UVSHIFT) * BPP, SS(r, UVSHIFT) * BPP); \ - memcpy(temp + 128, src_ptr + src_stride + (n >> UVSHIFT) * BPP, \ - SS(r, UVSHIFT) * BPP); \ - if ((width & 1) && UVSHIFT == 0) { /* repeat last pixel for subsample */ \ - memcpy(temp + SS(r, UVSHIFT) * BPP, temp + SS(r, UVSHIFT) * BPP - BPP, \ - BPP); \ - memcpy(temp + 128 + SS(r, UVSHIFT) * BPP, \ - temp + 128 + SS(r, UVSHIFT) * BPP - BPP, BPP); \ - } \ - ANY_SIMD(temp, 128, temp + 256, MASK + 1); \ - memcpy(dst_vu + (n >> 1) * 2, temp + 256, SS(r, 1) * 2); \ - } - -#ifdef HAS_AYUVTOVUROW_NEON -ANY11S(AYUVToUVRow_Any_NEON, AYUVToUVRow_NEON, 0, 4, 15) -ANY11S(AYUVToVURow_Any_NEON, AYUVToVURow_NEON, 0, 4, 15) -#endif -#undef ANY11S - -#ifdef __cplusplus -} // extern "C" -} // namespace libyuv -#endif diff --git a/thirdparty/libyuv/source/row_common.cc b/thirdparty/libyuv/source/row_common.cc deleted file mode 100644 index 4d0dce2..0000000 --- a/thirdparty/libyuv/source/row_common.cc +++ /dev/null @@ -1,4212 +0,0 @@ -/* - * Copyright 2011 The LibYuv Project Authors. All rights reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. 
All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#include "libyuv/row.h" - -#include <assert.h> -#include <stdio.h> -#include <string.h> // For memcpy and memset. - -#include "libyuv/basic_types.h" -#include "libyuv/convert_argb.h" // For kYuvI601Constants - -#ifdef __cplusplus -namespace libyuv { -extern "C" { -#endif - -// This macro control YUV to RGB using unsigned math to extend range of -// YUV to RGB coefficients to 0 to 4 instead of 0 to 2 for more accuracy on B: -// LIBYUV_UNLIMITED_DATA - -// The following macro from row_win makes the C code match the row_win code, -// which is 7 bit fixed point for ARGBToI420: -#if !defined(LIBYUV_DISABLE_X86) && defined(_MSC_VER) && \ - !defined(__clang__) && (defined(_M_IX86) || defined(_M_X64)) -#define LIBYUV_RGB7 1 -#endif - -#if defined(__x86_64__) || defined(_M_X64) || defined(__i386__) || \ - defined(_M_IX86) -#define LIBYUV_ARGBTOUV_PAVGB 1 -#define LIBYUV_RGBTOU_TRUNCATE 1 -#endif - -// llvm x86 is poor at ternary operator, so use branchless min/max. - -#define USE_BRANCHLESS 1 -#if USE_BRANCHLESS -static __inline int32_t clamp0(int32_t v) { - return -(v >= 0) & v; -} -// TODO(fbarchard): make clamp255 preserve negative values. -static __inline int32_t clamp255(int32_t v) { - return (-(v >= 255) | v) & 255; -} - -static __inline int32_t clamp1023(int32_t v) { - return (-(v >= 1023) | v) & 1023; -} - -// clamp to max -static __inline int32_t ClampMax(int32_t v, int32_t max) { - return (-(v >= max) | v) & max; -} - -static __inline uint32_t Abs(int32_t v) { - int m = -(v < 0); - return (v + m) ^ m; -} -#else // USE_BRANCHLESS -static __inline int32_t clamp0(int32_t v) { - return (v < 0) ? 0 : v; -} - -static __inline int32_t clamp255(int32_t v) { - return (v > 255) ? 255 : v; -} - -static __inline int32_t clamp1023(int32_t v) { - return (v > 1023) ? 1023 : v; -} - -static __inline int32_t ClampMax(int32_t v, int32_t max) { - return (v > max) ? max : v; -} - -static __inline uint32_t Abs(int32_t v) { - return (v < 0) ? 
-v : v; -} -#endif // USE_BRANCHLESS -static __inline uint32_t Clamp(int32_t val) { - int v = clamp0(val); - return (uint32_t)(clamp255(v)); -} - -static __inline uint32_t Clamp10(int32_t val) { - int v = clamp0(val); - return (uint32_t)(clamp1023(v)); -} - -// Little Endian -#if defined(__x86_64__) || defined(_M_X64) || defined(__i386__) || \ - defined(_M_IX86) || defined(__arm__) || defined(_M_ARM) || \ - (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__) -#define WRITEWORD(p, v) *(uint32_t*)(p) = v -#else -static inline void WRITEWORD(uint8_t* p, uint32_t v) { - p[0] = (uint8_t)(v & 255); - p[1] = (uint8_t)((v >> 8) & 255); - p[2] = (uint8_t)((v >> 16) & 255); - p[3] = (uint8_t)((v >> 24) & 255); -} -#endif - -void RGB24ToARGBRow_C(const uint8_t* src_rgb24, uint8_t* dst_argb, int width) { - int x; - for (x = 0; x < width; ++x) { - uint8_t b = src_rgb24[0]; - uint8_t g = src_rgb24[1]; - uint8_t r = src_rgb24[2]; - dst_argb[0] = b; - dst_argb[1] = g; - dst_argb[2] = r; - dst_argb[3] = 255u; - dst_argb += 4; - src_rgb24 += 3; - } -} - -void RAWToARGBRow_C(const uint8_t* src_raw, uint8_t* dst_argb, int width) { - int x; - for (x = 0; x < width; ++x) { - uint8_t r = src_raw[0]; - uint8_t g = src_raw[1]; - uint8_t b = src_raw[2]; - dst_argb[0] = b; - dst_argb[1] = g; - dst_argb[2] = r; - dst_argb[3] = 255u; - dst_argb += 4; - src_raw += 3; - } -} - -void RAWToRGBARow_C(const uint8_t* src_raw, uint8_t* dst_rgba, int width) { - int x; - for (x = 0; x < width; ++x) { - uint8_t r = src_raw[0]; - uint8_t g = src_raw[1]; - uint8_t b = src_raw[2]; - dst_rgba[0] = 255u; - dst_rgba[1] = b; - dst_rgba[2] = g; - dst_rgba[3] = r; - dst_rgba += 4; - src_raw += 3; - } -} - -void RAWToRGB24Row_C(const uint8_t* src_raw, uint8_t* dst_rgb24, int width) { - int x; - for (x = 0; x < width; ++x) { - uint8_t r = src_raw[0]; - uint8_t g = src_raw[1]; - uint8_t b = src_raw[2]; - dst_rgb24[0] = b; - dst_rgb24[1] = g; - dst_rgb24[2] = r; - dst_rgb24 += 3; - src_raw += 3; - } -} - -void RGB565ToARGBRow_C(const uint8_t* src_rgb565, - uint8_t* dst_argb, - int width) { - int x; - for (x = 0; x < width; ++x) { - uint8_t b = src_rgb565[0] & 0x1f; - uint8_t g = (src_rgb565[0] >> 5) | ((src_rgb565[1] & 0x07) << 3); - uint8_t r = src_rgb565[1] >> 3; - dst_argb[0] = (b << 3) | (b >> 2); - dst_argb[1] = (g << 2) | (g >> 4); - dst_argb[2] = (r << 3) | (r >> 2); - dst_argb[3] = 255u; - dst_argb += 4; - src_rgb565 += 2; - } -} - -void ARGB1555ToARGBRow_C(const uint8_t* src_argb1555, - uint8_t* dst_argb, - int width) { - int x; - for (x = 0; x < width; ++x) { - uint8_t b = src_argb1555[0] & 0x1f; - uint8_t g = (src_argb1555[0] >> 5) | ((src_argb1555[1] & 0x03) << 3); - uint8_t r = (src_argb1555[1] & 0x7c) >> 2; - uint8_t a = src_argb1555[1] >> 7; - dst_argb[0] = (b << 3) | (b >> 2); - dst_argb[1] = (g << 3) | (g >> 2); - dst_argb[2] = (r << 3) | (r >> 2); - dst_argb[3] = -a; - dst_argb += 4; - src_argb1555 += 2; - } -} - -void ARGB4444ToARGBRow_C(const uint8_t* src_argb4444, - uint8_t* dst_argb, - int width) { - int x; - for (x = 0; x < width; ++x) { - uint8_t b = src_argb4444[0] & 0x0f; - uint8_t g = src_argb4444[0] >> 4; - uint8_t r = src_argb4444[1] & 0x0f; - uint8_t a = src_argb4444[1] >> 4; - dst_argb[0] = (b << 4) | b; - dst_argb[1] = (g << 4) | g; - dst_argb[2] = (r << 4) | r; - dst_argb[3] = (a << 4) | a; - dst_argb += 4; - src_argb4444 += 2; - } -} - -void AR30ToARGBRow_C(const uint8_t* src_ar30, uint8_t* dst_argb, int width) { - int x; - for (x = 0; x < width; ++x) { - uint32_t ar30; - memcpy(&ar30, 
src_ar30, sizeof ar30); - uint32_t b = (ar30 >> 2) & 0xff; - uint32_t g = (ar30 >> 12) & 0xff; - uint32_t r = (ar30 >> 22) & 0xff; - uint32_t a = (ar30 >> 30) * 0x55; // Replicate 2 bits to 8 bits. - *(uint32_t*)(dst_argb) = b | (g << 8) | (r << 16) | (a << 24); - dst_argb += 4; - src_ar30 += 4; - } -} - -void AR30ToABGRRow_C(const uint8_t* src_ar30, uint8_t* dst_abgr, int width) { - int x; - for (x = 0; x < width; ++x) { - uint32_t ar30; - memcpy(&ar30, src_ar30, sizeof ar30); - uint32_t b = (ar30 >> 2) & 0xff; - uint32_t g = (ar30 >> 12) & 0xff; - uint32_t r = (ar30 >> 22) & 0xff; - uint32_t a = (ar30 >> 30) * 0x55; // Replicate 2 bits to 8 bits. - *(uint32_t*)(dst_abgr) = r | (g << 8) | (b << 16) | (a << 24); - dst_abgr += 4; - src_ar30 += 4; - } -} - -void AR30ToAB30Row_C(const uint8_t* src_ar30, uint8_t* dst_ab30, int width) { - int x; - for (x = 0; x < width; ++x) { - uint32_t ar30; - memcpy(&ar30, src_ar30, sizeof ar30); - uint32_t b = ar30 & 0x3ff; - uint32_t ga = ar30 & 0xc00ffc00; - uint32_t r = (ar30 >> 20) & 0x3ff; - *(uint32_t*)(dst_ab30) = r | ga | (b << 20); - dst_ab30 += 4; - src_ar30 += 4; - } -} - -void ARGBToRGB24Row_C(const uint8_t* src_argb, uint8_t* dst_rgb, int width) { - int x; - for (x = 0; x < width; ++x) { - uint8_t b = src_argb[0]; - uint8_t g = src_argb[1]; - uint8_t r = src_argb[2]; - dst_rgb[0] = b; - dst_rgb[1] = g; - dst_rgb[2] = r; - dst_rgb += 3; - src_argb += 4; - } -} - -void ARGBToRAWRow_C(const uint8_t* src_argb, uint8_t* dst_rgb, int width) { - int x; - for (x = 0; x < width; ++x) { - uint8_t b = src_argb[0]; - uint8_t g = src_argb[1]; - uint8_t r = src_argb[2]; - dst_rgb[0] = r; - dst_rgb[1] = g; - dst_rgb[2] = b; - dst_rgb += 3; - src_argb += 4; - } -} - -void ARGBToRGB565Row_C(const uint8_t* src_argb, uint8_t* dst_rgb, int width) { - int x; - for (x = 0; x < width - 1; x += 2) { - uint8_t b0 = src_argb[0] >> 3; - uint8_t g0 = src_argb[1] >> 2; - uint8_t r0 = src_argb[2] >> 3; - uint8_t b1 = src_argb[4] >> 3; - uint8_t g1 = src_argb[5] >> 2; - uint8_t r1 = src_argb[6] >> 3; - WRITEWORD(dst_rgb, b0 | (g0 << 5) | (r0 << 11) | (b1 << 16) | (g1 << 21) | - (r1 << 27)); - dst_rgb += 4; - src_argb += 8; - } - if (width & 1) { - uint8_t b0 = src_argb[0] >> 3; - uint8_t g0 = src_argb[1] >> 2; - uint8_t r0 = src_argb[2] >> 3; - *(uint16_t*)(dst_rgb) = b0 | (g0 << 5) | (r0 << 11); - } -} - -// dither4 is a row of 4 values from 4x4 dither matrix. -// The 4x4 matrix contains values to increase RGB. When converting to -// fewer bits (565) this provides an ordered dither. -// The order in the 4x4 matrix in first byte is upper left. -// The 4 values are passed as an int, then referenced as an array, so -// endian will not affect order of the original matrix. But the dither4 -// will containing the first pixel in the lower byte for little endian -// or the upper byte for big endian. 
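A minimal sketch, not taken from the file, of how a single pixel picks up its dither value before the 565 truncation done by ARGBToRGB565DitherRow_C below. The packed matrix row in the usage note is a made-up example value, and clamp255 is the helper defined earlier in this file.

static uint16_t DitherPixelTo565(const uint8_t* argb, uint32_t dither4, int x) {
  int d = ((const unsigned char*)&dither4)[x & 3]; /* matrix column for this x */
  int b = clamp255(argb[0] + d) >> 3;              /* 8 -> 5 bits after dithering */
  int g = clamp255(argb[1] + d) >> 2;              /* 8 -> 6 bits */
  int r = clamp255(argb[2] + d) >> 3;              /* 8 -> 5 bits */
  return (uint16_t)(b | (g << 5) | (r << 11));
}
/* Usage: dither4 = 0x00020406 applies +6, +4, +2, +0 to x & 3 == 0, 1, 2, 3
 * on a little-endian machine. */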
-void ARGBToRGB565DitherRow_C(const uint8_t* src_argb, - uint8_t* dst_rgb, - const uint32_t dither4, - int width) { - int x; - for (x = 0; x < width - 1; x += 2) { - int dither0 = ((const unsigned char*)(&dither4))[x & 3]; - int dither1 = ((const unsigned char*)(&dither4))[(x + 1) & 3]; - uint8_t b0 = clamp255(src_argb[0] + dither0) >> 3; - uint8_t g0 = clamp255(src_argb[1] + dither0) >> 2; - uint8_t r0 = clamp255(src_argb[2] + dither0) >> 3; - uint8_t b1 = clamp255(src_argb[4] + dither1) >> 3; - uint8_t g1 = clamp255(src_argb[5] + dither1) >> 2; - uint8_t r1 = clamp255(src_argb[6] + dither1) >> 3; - WRITEWORD(dst_rgb, b0 | (g0 << 5) | (r0 << 11) | (b1 << 16) | (g1 << 21) | - (r1 << 27)); - dst_rgb += 4; - src_argb += 8; - } - if (width & 1) { - int dither0 = ((const unsigned char*)(&dither4))[(width - 1) & 3]; - uint8_t b0 = clamp255(src_argb[0] + dither0) >> 3; - uint8_t g0 = clamp255(src_argb[1] + dither0) >> 2; - uint8_t r0 = clamp255(src_argb[2] + dither0) >> 3; - *(uint16_t*)(dst_rgb) = b0 | (g0 << 5) | (r0 << 11); - } -} - -void ARGBToARGB1555Row_C(const uint8_t* src_argb, uint8_t* dst_rgb, int width) { - int x; - for (x = 0; x < width - 1; x += 2) { - uint8_t b0 = src_argb[0] >> 3; - uint8_t g0 = src_argb[1] >> 3; - uint8_t r0 = src_argb[2] >> 3; - uint8_t a0 = src_argb[3] >> 7; - uint8_t b1 = src_argb[4] >> 3; - uint8_t g1 = src_argb[5] >> 3; - uint8_t r1 = src_argb[6] >> 3; - uint8_t a1 = src_argb[7] >> 7; - *(uint32_t*)(dst_rgb) = b0 | (g0 << 5) | (r0 << 10) | (a0 << 15) | - (b1 << 16) | (g1 << 21) | (r1 << 26) | (a1 << 31); - dst_rgb += 4; - src_argb += 8; - } - if (width & 1) { - uint8_t b0 = src_argb[0] >> 3; - uint8_t g0 = src_argb[1] >> 3; - uint8_t r0 = src_argb[2] >> 3; - uint8_t a0 = src_argb[3] >> 7; - *(uint16_t*)(dst_rgb) = b0 | (g0 << 5) | (r0 << 10) | (a0 << 15); - } -} - -void ARGBToARGB4444Row_C(const uint8_t* src_argb, uint8_t* dst_rgb, int width) { - int x; - for (x = 0; x < width - 1; x += 2) { - uint8_t b0 = src_argb[0] >> 4; - uint8_t g0 = src_argb[1] >> 4; - uint8_t r0 = src_argb[2] >> 4; - uint8_t a0 = src_argb[3] >> 4; - uint8_t b1 = src_argb[4] >> 4; - uint8_t g1 = src_argb[5] >> 4; - uint8_t r1 = src_argb[6] >> 4; - uint8_t a1 = src_argb[7] >> 4; - *(uint32_t*)(dst_rgb) = b0 | (g0 << 4) | (r0 << 8) | (a0 << 12) | - (b1 << 16) | (g1 << 20) | (r1 << 24) | (a1 << 28); - dst_rgb += 4; - src_argb += 8; - } - if (width & 1) { - uint8_t b0 = src_argb[0] >> 4; - uint8_t g0 = src_argb[1] >> 4; - uint8_t r0 = src_argb[2] >> 4; - uint8_t a0 = src_argb[3] >> 4; - *(uint16_t*)(dst_rgb) = b0 | (g0 << 4) | (r0 << 8) | (a0 << 12); - } -} - -void ABGRToAR30Row_C(const uint8_t* src_abgr, uint8_t* dst_ar30, int width) { - int x; - for (x = 0; x < width; ++x) { - uint32_t b0 = (src_abgr[0] >> 6) | ((uint32_t)(src_abgr[0]) << 2); - uint32_t g0 = (src_abgr[1] >> 6) | ((uint32_t)(src_abgr[1]) << 2); - uint32_t r0 = (src_abgr[2] >> 6) | ((uint32_t)(src_abgr[2]) << 2); - uint32_t a0 = (src_abgr[3] >> 6); - *(uint32_t*)(dst_ar30) = r0 | (g0 << 10) | (b0 << 20) | (a0 << 30); - dst_ar30 += 4; - src_abgr += 4; - } -} - -void ARGBToAR30Row_C(const uint8_t* src_argb, uint8_t* dst_ar30, int width) { - int x; - for (x = 0; x < width; ++x) { - uint32_t b0 = (src_argb[0] >> 6) | ((uint32_t)(src_argb[0]) << 2); - uint32_t g0 = (src_argb[1] >> 6) | ((uint32_t)(src_argb[1]) << 2); - uint32_t r0 = (src_argb[2] >> 6) | ((uint32_t)(src_argb[2]) << 2); - uint32_t a0 = (src_argb[3] >> 6); - *(uint32_t*)(dst_ar30) = b0 | (g0 << 10) | (r0 << 20) | (a0 << 30); - dst_ar30 += 4; - src_argb += 4; - } 
-} - -void ARGBToAR64Row_C(const uint8_t* src_argb, uint16_t* dst_ar64, int width) { - int x; - for (x = 0; x < width; ++x) { - dst_ar64[0] = src_argb[0] * 0x0101; - dst_ar64[1] = src_argb[1] * 0x0101; - dst_ar64[2] = src_argb[2] * 0x0101; - dst_ar64[3] = src_argb[3] * 0x0101; - dst_ar64 += 4; - src_argb += 4; - } -} - -void ARGBToAB64Row_C(const uint8_t* src_argb, uint16_t* dst_ab64, int width) { - int x; - for (x = 0; x < width; ++x) { - dst_ab64[0] = src_argb[2] * 0x0101; - dst_ab64[1] = src_argb[1] * 0x0101; - dst_ab64[2] = src_argb[0] * 0x0101; - dst_ab64[3] = src_argb[3] * 0x0101; - dst_ab64 += 4; - src_argb += 4; - } -} - -void AR64ToARGBRow_C(const uint16_t* src_ar64, uint8_t* dst_argb, int width) { - int x; - for (x = 0; x < width; ++x) { - dst_argb[0] = src_ar64[0] >> 8; - dst_argb[1] = src_ar64[1] >> 8; - dst_argb[2] = src_ar64[2] >> 8; - dst_argb[3] = src_ar64[3] >> 8; - dst_argb += 4; - src_ar64 += 4; - } -} - -void AB64ToARGBRow_C(const uint16_t* src_ab64, uint8_t* dst_argb, int width) { - int x; - for (x = 0; x < width; ++x) { - dst_argb[0] = src_ab64[2] >> 8; - dst_argb[1] = src_ab64[1] >> 8; - dst_argb[2] = src_ab64[0] >> 8; - dst_argb[3] = src_ab64[3] >> 8; - dst_argb += 4; - src_ab64 += 4; - } -} - -// TODO(fbarchard): Make shuffle compatible with SIMD versions -void AR64ShuffleRow_C(const uint8_t* src_ar64, - uint8_t* dst_ar64, - const uint8_t* shuffler, - int width) { - const uint16_t* src_ar64_16 = (const uint16_t*)src_ar64; - uint16_t* dst_ar64_16 = (uint16_t*)dst_ar64; - int index0 = shuffler[0] / 2; - int index1 = shuffler[2] / 2; - int index2 = shuffler[4] / 2; - int index3 = shuffler[6] / 2; - // Shuffle a row of AR64. - int x; - for (x = 0; x < width / 2; ++x) { - // To support in-place conversion. - uint16_t b = src_ar64_16[index0]; - uint16_t g = src_ar64_16[index1]; - uint16_t r = src_ar64_16[index2]; - uint16_t a = src_ar64_16[index3]; - dst_ar64_16[0] = b; - dst_ar64_16[1] = g; - dst_ar64_16[2] = r; - dst_ar64_16[3] = a; - src_ar64_16 += 4; - dst_ar64_16 += 4; - } -} - -#ifdef LIBYUV_RGB7 -// Old 7 bit math for compatibility on unsupported platforms. -static __inline int RGBToY(uint8_t r, uint8_t g, uint8_t b) { - return ((33 * r + 65 * g + 13 * b) >> 7) + 16; -} -#else -// 8 bit -// Intel SSE/AVX uses the following equivalent formula -// 0x7e80 = (66 + 129 + 25) * -128 + 0x1000 (for +16) and 0x0080 for round. 
-// return (66 * ((int)r - 128) + 129 * ((int)g - 128) + 25 * ((int)b - 128) + -// 0x7e80) >> 8; - -static __inline int RGBToY(uint8_t r, uint8_t g, uint8_t b) { - return (66 * r + 129 * g + 25 * b + 0x1080) >> 8; -} -#endif - -#define AVGB(a, b) (((a) + (b) + 1) >> 1) - -#ifdef LIBYUV_RGBTOU_TRUNCATE -static __inline int RGBToU(uint8_t r, uint8_t g, uint8_t b) { - return (112 * b - 74 * g - 38 * r + 0x8000) >> 8; -} -static __inline int RGBToV(uint8_t r, uint8_t g, uint8_t b) { - return (112 * r - 94 * g - 18 * b + 0x8000) >> 8; -} -#else -// TODO(fbarchard): Add rounding to SIMD and use this -static __inline int RGBToU(uint8_t r, uint8_t g, uint8_t b) { - return (112 * b - 74 * g - 38 * r + 0x8080) >> 8; -} -static __inline int RGBToV(uint8_t r, uint8_t g, uint8_t b) { - return (112 * r - 94 * g - 18 * b + 0x8080) >> 8; -} -#endif - -#if !defined(LIBYUV_ARGBTOUV_PAVGB) -static __inline int RGB2xToU(uint16_t r, uint16_t g, uint16_t b) { - return ((112 / 2) * b - (74 / 2) * g - (38 / 2) * r + 0x8080) >> 8; -} -static __inline int RGB2xToV(uint16_t r, uint16_t g, uint16_t b) { - return ((112 / 2) * r - (94 / 2) * g - (18 / 2) * b + 0x8080) >> 8; -} -#endif - -// ARGBToY_C and ARGBToUV_C -// Intel version mimic SSE/AVX which does 2 pavgb -#if LIBYUV_ARGBTOUV_PAVGB - -#define MAKEROWY(NAME, R, G, B, BPP) \ - void NAME##ToYRow_C(const uint8_t* src_rgb, uint8_t* dst_y, int width) { \ - int x; \ - for (x = 0; x < width; ++x) { \ - dst_y[0] = RGBToY(src_rgb[R], src_rgb[G], src_rgb[B]); \ - src_rgb += BPP; \ - dst_y += 1; \ - } \ - } \ - void NAME##ToUVRow_C(const uint8_t* src_rgb, int src_stride_rgb, \ - uint8_t* dst_u, uint8_t* dst_v, int width) { \ - const uint8_t* src_rgb1 = src_rgb + src_stride_rgb; \ - int x; \ - for (x = 0; x < width - 1; x += 2) { \ - uint8_t ab = AVGB(AVGB(src_rgb[B], src_rgb1[B]), \ - AVGB(src_rgb[B + BPP], src_rgb1[B + BPP])); \ - uint8_t ag = AVGB(AVGB(src_rgb[G], src_rgb1[G]), \ - AVGB(src_rgb[G + BPP], src_rgb1[G + BPP])); \ - uint8_t ar = AVGB(AVGB(src_rgb[R], src_rgb1[R]), \ - AVGB(src_rgb[R + BPP], src_rgb1[R + BPP])); \ - dst_u[0] = RGBToU(ar, ag, ab); \ - dst_v[0] = RGBToV(ar, ag, ab); \ - src_rgb += BPP * 2; \ - src_rgb1 += BPP * 2; \ - dst_u += 1; \ - dst_v += 1; \ - } \ - if (width & 1) { \ - uint8_t ab = AVGB(src_rgb[B], src_rgb1[B]); \ - uint8_t ag = AVGB(src_rgb[G], src_rgb1[G]); \ - uint8_t ar = AVGB(src_rgb[R], src_rgb1[R]); \ - dst_u[0] = RGBToU(ar, ag, ab); \ - dst_v[0] = RGBToV(ar, ag, ab); \ - } \ - } -#else -// ARM version does sum / 2 then multiply by 2x smaller coefficients -#define MAKEROWY(NAME, R, G, B, BPP) \ - void NAME##ToYRow_C(const uint8_t* src_rgb, uint8_t* dst_y, int width) { \ - int x; \ - for (x = 0; x < width; ++x) { \ - dst_y[0] = RGBToY(src_rgb[R], src_rgb[G], src_rgb[B]); \ - src_rgb += BPP; \ - dst_y += 1; \ - } \ - } \ - void NAME##ToUVRow_C(const uint8_t* src_rgb, int src_stride_rgb, \ - uint8_t* dst_u, uint8_t* dst_v, int width) { \ - const uint8_t* src_rgb1 = src_rgb + src_stride_rgb; \ - int x; \ - for (x = 0; x < width - 1; x += 2) { \ - uint16_t ab = (src_rgb[B] + src_rgb[B + BPP] + src_rgb1[B] + \ - src_rgb1[B + BPP] + 1) >> \ - 1; \ - uint16_t ag = (src_rgb[G] + src_rgb[G + BPP] + src_rgb1[G] + \ - src_rgb1[G + BPP] + 1) >> \ - 1; \ - uint16_t ar = (src_rgb[R] + src_rgb[R + BPP] + src_rgb1[R] + \ - src_rgb1[R + BPP] + 1) >> \ - 1; \ - dst_u[0] = RGB2xToU(ar, ag, ab); \ - dst_v[0] = RGB2xToV(ar, ag, ab); \ - src_rgb += BPP * 2; \ - src_rgb1 += BPP * 2; \ - dst_u += 1; \ - dst_v += 1; \ - } \ - if (width & 1) { \ - 
uint16_t ab = src_rgb[B] + src_rgb1[B]; \ - uint16_t ag = src_rgb[G] + src_rgb1[G]; \ - uint16_t ar = src_rgb[R] + src_rgb1[R]; \ - dst_u[0] = RGB2xToU(ar, ag, ab); \ - dst_v[0] = RGB2xToV(ar, ag, ab); \ - } \ - } -#endif - -MAKEROWY(ARGB, 2, 1, 0, 4) -MAKEROWY(BGRA, 1, 2, 3, 4) -MAKEROWY(ABGR, 0, 1, 2, 4) -MAKEROWY(RGBA, 3, 2, 1, 4) -MAKEROWY(RGB24, 2, 1, 0, 3) -MAKEROWY(RAW, 0, 1, 2, 3) -#undef MAKEROWY - -// JPeg uses a variation on BT.601-1 full range -// y = 0.29900 * r + 0.58700 * g + 0.11400 * b -// u = -0.16874 * r - 0.33126 * g + 0.50000 * b + center -// v = 0.50000 * r - 0.41869 * g - 0.08131 * b + center -// BT.601 Mpeg range uses: -// b 0.1016 * 255 = 25.908 = 25 -// g 0.5078 * 255 = 129.489 = 129 -// r 0.2578 * 255 = 65.739 = 66 -// JPeg 7 bit Y (deprecated) -// b 0.11400 * 128 = 14.592 = 15 -// g 0.58700 * 128 = 75.136 = 75 -// r 0.29900 * 128 = 38.272 = 38 -// JPeg 8 bit Y: -// b 0.11400 * 256 = 29.184 = 29 -// g 0.58700 * 256 = 150.272 = 150 -// r 0.29900 * 256 = 76.544 = 77 -// JPeg 8 bit U: -// b 0.50000 * 255 = 127.5 = 127 -// g -0.33126 * 255 = -84.4713 = -84 -// r -0.16874 * 255 = -43.0287 = -43 -// JPeg 8 bit V: -// b -0.08131 * 255 = -20.73405 = -20 -// g -0.41869 * 255 = -106.76595 = -107 -// r 0.50000 * 255 = 127.5 = 127 - -#ifdef LIBYUV_RGB7 -// Old 7 bit math for compatibility on unsupported platforms. -static __inline int RGBToYJ(uint8_t r, uint8_t g, uint8_t b) { - return (38 * r + 75 * g + 15 * b + 64) >> 7; -} -#else -// 8 bit -static __inline int RGBToYJ(uint8_t r, uint8_t g, uint8_t b) { - return (77 * r + 150 * g + 29 * b + 128) >> 8; -} -#endif - -#if defined(LIBYUV_ARGBTOUV_PAVGB) -static __inline int RGBToUJ(uint8_t r, uint8_t g, uint8_t b) { - return (127 * b - 84 * g - 43 * r + 0x8080) >> 8; -} -static __inline int RGBToVJ(uint8_t r, uint8_t g, uint8_t b) { - return (127 * r - 107 * g - 20 * b + 0x8080) >> 8; -} -#else -static __inline int RGB2xToUJ(uint16_t r, uint16_t g, uint16_t b) { - return ((127 / 2) * b - (84 / 2) * g - (43 / 2) * r + 0x8080) >> 8; -} -static __inline int RGB2xToVJ(uint16_t r, uint16_t g, uint16_t b) { - return ((127 / 2) * r - (107 / 2) * g - (20 / 2) * b + 0x8080) >> 8; -} -#endif - -// ARGBToYJ_C and ARGBToUVJ_C -// Intel version mimic SSE/AVX which does 2 pavgb -#if LIBYUV_ARGBTOUV_PAVGB -#define MAKEROWYJ(NAME, R, G, B, BPP) \ - void NAME##ToYJRow_C(const uint8_t* src_rgb, uint8_t* dst_y, int width) { \ - int x; \ - for (x = 0; x < width; ++x) { \ - dst_y[0] = RGBToYJ(src_rgb[R], src_rgb[G], src_rgb[B]); \ - src_rgb += BPP; \ - dst_y += 1; \ - } \ - } \ - void NAME##ToUVJRow_C(const uint8_t* src_rgb, int src_stride_rgb, \ - uint8_t* dst_u, uint8_t* dst_v, int width) { \ - const uint8_t* src_rgb1 = src_rgb + src_stride_rgb; \ - int x; \ - for (x = 0; x < width - 1; x += 2) { \ - uint8_t ab = AVGB(AVGB(src_rgb[B], src_rgb1[B]), \ - AVGB(src_rgb[B + BPP], src_rgb1[B + BPP])); \ - uint8_t ag = AVGB(AVGB(src_rgb[G], src_rgb1[G]), \ - AVGB(src_rgb[G + BPP], src_rgb1[G + BPP])); \ - uint8_t ar = AVGB(AVGB(src_rgb[R], src_rgb1[R]), \ - AVGB(src_rgb[R + BPP], src_rgb1[R + BPP])); \ - dst_u[0] = RGBToUJ(ar, ag, ab); \ - dst_v[0] = RGBToVJ(ar, ag, ab); \ - src_rgb += BPP * 2; \ - src_rgb1 += BPP * 2; \ - dst_u += 1; \ - dst_v += 1; \ - } \ - if (width & 1) { \ - uint8_t ab = AVGB(src_rgb[B], src_rgb1[B]); \ - uint8_t ag = AVGB(src_rgb[G], src_rgb1[G]); \ - uint8_t ar = AVGB(src_rgb[R], src_rgb1[R]); \ - dst_u[0] = RGBToUJ(ar, ag, ab); \ - dst_v[0] = RGBToVJ(ar, ag, ab); \ - } \ - } -#else -// ARM version does sum / 2 then multiply by 
2x smaller coefficients -#define MAKEROWYJ(NAME, R, G, B, BPP) \ - void NAME##ToYJRow_C(const uint8_t* src_rgb, uint8_t* dst_y, int width) { \ - int x; \ - for (x = 0; x < width; ++x) { \ - dst_y[0] = RGBToYJ(src_rgb[R], src_rgb[G], src_rgb[B]); \ - src_rgb += BPP; \ - dst_y += 1; \ - } \ - } \ - void NAME##ToUVJRow_C(const uint8_t* src_rgb, int src_stride_rgb, \ - uint8_t* dst_u, uint8_t* dst_v, int width) { \ - const uint8_t* src_rgb1 = src_rgb + src_stride_rgb; \ - int x; \ - for (x = 0; x < width - 1; x += 2) { \ - uint16_t ab = (src_rgb[B] + src_rgb[B + BPP] + src_rgb1[B] + \ - src_rgb1[B + BPP] + 1) >> \ - 1; \ - uint16_t ag = (src_rgb[G] + src_rgb[G + BPP] + src_rgb1[G] + \ - src_rgb1[G + BPP] + 1) >> \ - 1; \ - uint16_t ar = (src_rgb[R] + src_rgb[R + BPP] + src_rgb1[R] + \ - src_rgb1[R + BPP] + 1) >> \ - 1; \ - dst_u[0] = RGB2xToUJ(ar, ag, ab); \ - dst_v[0] = RGB2xToVJ(ar, ag, ab); \ - src_rgb += BPP * 2; \ - src_rgb1 += BPP * 2; \ - dst_u += 1; \ - dst_v += 1; \ - } \ - if (width & 1) { \ - uint16_t ab = (src_rgb[B] + src_rgb1[B]); \ - uint16_t ag = (src_rgb[G] + src_rgb1[G]); \ - uint16_t ar = (src_rgb[R] + src_rgb1[R]); \ - dst_u[0] = RGB2xToUJ(ar, ag, ab); \ - dst_v[0] = RGB2xToVJ(ar, ag, ab); \ - } \ - } - -#endif - -MAKEROWYJ(ARGB, 2, 1, 0, 4) -MAKEROWYJ(RGBA, 3, 2, 1, 4) -MAKEROWYJ(RGB24, 2, 1, 0, 3) -MAKEROWYJ(RAW, 0, 1, 2, 3) -#undef MAKEROWYJ - -void RGB565ToYRow_C(const uint8_t* src_rgb565, uint8_t* dst_y, int width) { - int x; - for (x = 0; x < width; ++x) { - uint8_t b = src_rgb565[0] & 0x1f; - uint8_t g = (src_rgb565[0] >> 5) | ((src_rgb565[1] & 0x07) << 3); - uint8_t r = src_rgb565[1] >> 3; - b = (b << 3) | (b >> 2); - g = (g << 2) | (g >> 4); - r = (r << 3) | (r >> 2); - dst_y[0] = RGBToY(r, g, b); - src_rgb565 += 2; - dst_y += 1; - } -} - -void ARGB1555ToYRow_C(const uint8_t* src_argb1555, uint8_t* dst_y, int width) { - int x; - for (x = 0; x < width; ++x) { - uint8_t b = src_argb1555[0] & 0x1f; - uint8_t g = (src_argb1555[0] >> 5) | ((src_argb1555[1] & 0x03) << 3); - uint8_t r = (src_argb1555[1] & 0x7c) >> 2; - b = (b << 3) | (b >> 2); - g = (g << 3) | (g >> 2); - r = (r << 3) | (r >> 2); - dst_y[0] = RGBToY(r, g, b); - src_argb1555 += 2; - dst_y += 1; - } -} - -void ARGB4444ToYRow_C(const uint8_t* src_argb4444, uint8_t* dst_y, int width) { - int x; - for (x = 0; x < width; ++x) { - uint8_t b = src_argb4444[0] & 0x0f; - uint8_t g = src_argb4444[0] >> 4; - uint8_t r = src_argb4444[1] & 0x0f; - b = (b << 4) | b; - g = (g << 4) | g; - r = (r << 4) | r; - dst_y[0] = RGBToY(r, g, b); - src_argb4444 += 2; - dst_y += 1; - } -} - -void RGB565ToUVRow_C(const uint8_t* src_rgb565, - int src_stride_rgb565, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - const uint8_t* next_rgb565 = src_rgb565 + src_stride_rgb565; - int x; - for (x = 0; x < width - 1; x += 2) { - uint8_t b0 = src_rgb565[0] & 0x1f; - uint8_t g0 = (src_rgb565[0] >> 5) | ((src_rgb565[1] & 0x07) << 3); - uint8_t r0 = src_rgb565[1] >> 3; - uint8_t b1 = src_rgb565[2] & 0x1f; - uint8_t g1 = (src_rgb565[2] >> 5) | ((src_rgb565[3] & 0x07) << 3); - uint8_t r1 = src_rgb565[3] >> 3; - uint8_t b2 = next_rgb565[0] & 0x1f; - uint8_t g2 = (next_rgb565[0] >> 5) | ((next_rgb565[1] & 0x07) << 3); - uint8_t r2 = next_rgb565[1] >> 3; - uint8_t b3 = next_rgb565[2] & 0x1f; - uint8_t g3 = (next_rgb565[2] >> 5) | ((next_rgb565[3] & 0x07) << 3); - uint8_t r3 = next_rgb565[3] >> 3; - - b0 = (b0 << 3) | (b0 >> 2); - g0 = (g0 << 2) | (g0 >> 4); - r0 = (r0 << 3) | (r0 >> 2); - b1 = (b1 << 3) | (b1 >> 2); - g1 = (g1 << 2) | (g1 >> 4); 
- r1 = (r1 << 3) | (r1 >> 2); - b2 = (b2 << 3) | (b2 >> 2); - g2 = (g2 << 2) | (g2 >> 4); - r2 = (r2 << 3) | (r2 >> 2); - b3 = (b3 << 3) | (b3 >> 2); - g3 = (g3 << 2) | (g3 >> 4); - r3 = (r3 << 3) | (r3 >> 2); - -#if LIBYUV_ARGBTOUV_PAVGB - uint8_t ab = AVGB(AVGB(b0, b2), AVGB(b1, b3)); - uint8_t ag = AVGB(AVGB(g0, g2), AVGB(g1, g3)); - uint8_t ar = AVGB(AVGB(r0, r2), AVGB(r1, r3)); - dst_u[0] = RGBToU(ar, ag, ab); - dst_v[0] = RGBToV(ar, ag, ab); -#else - uint16_t b = (b0 + b1 + b2 + b3 + 1) >> 1; - uint16_t g = (g0 + g1 + g2 + g3 + 1) >> 1; - uint16_t r = (r0 + r1 + r2 + r3 + 1) >> 1; - dst_u[0] = RGB2xToU(r, g, b); - dst_v[0] = RGB2xToV(r, g, b); -#endif - - src_rgb565 += 4; - next_rgb565 += 4; - dst_u += 1; - dst_v += 1; - } - if (width & 1) { - uint8_t b0 = src_rgb565[0] & 0x1f; - uint8_t g0 = (src_rgb565[0] >> 5) | ((src_rgb565[1] & 0x07) << 3); - uint8_t r0 = src_rgb565[1] >> 3; - uint8_t b2 = next_rgb565[0] & 0x1f; - uint8_t g2 = (next_rgb565[0] >> 5) | ((next_rgb565[1] & 0x07) << 3); - uint8_t r2 = next_rgb565[1] >> 3; - - b0 = (b0 << 3) | (b0 >> 2); - g0 = (g0 << 2) | (g0 >> 4); - r0 = (r0 << 3) | (r0 >> 2); - b2 = (b2 << 3) | (b2 >> 2); - g2 = (g2 << 2) | (g2 >> 4); - r2 = (r2 << 3) | (r2 >> 2); - -#if LIBYUV_ARGBTOUV_PAVGB - uint8_t ab = AVGB(b0, b2); - uint8_t ag = AVGB(g0, g2); - uint8_t ar = AVGB(r0, r2); - dst_u[0] = RGBToU(ar, ag, ab); - dst_v[0] = RGBToV(ar, ag, ab); -#else - uint16_t b = b0 + b2; - uint16_t g = g0 + g2; - uint16_t r = r0 + r2; - dst_u[0] = RGB2xToU(r, g, b); - dst_v[0] = RGB2xToV(r, g, b); -#endif - } -} - -void ARGB1555ToUVRow_C(const uint8_t* src_argb1555, - int src_stride_argb1555, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - const uint8_t* next_argb1555 = src_argb1555 + src_stride_argb1555; - int x; - for (x = 0; x < width - 1; x += 2) { - uint8_t b0 = src_argb1555[0] & 0x1f; - uint8_t g0 = (src_argb1555[0] >> 5) | ((src_argb1555[1] & 0x03) << 3); - uint8_t r0 = (src_argb1555[1] & 0x7c) >> 2; - uint8_t b1 = src_argb1555[2] & 0x1f; - uint8_t g1 = (src_argb1555[2] >> 5) | ((src_argb1555[3] & 0x03) << 3); - uint8_t r1 = (src_argb1555[3] & 0x7c) >> 2; - uint8_t b2 = next_argb1555[0] & 0x1f; - uint8_t g2 = (next_argb1555[0] >> 5) | ((next_argb1555[1] & 0x03) << 3); - uint8_t r2 = (next_argb1555[1] & 0x7c) >> 2; - uint8_t b3 = next_argb1555[2] & 0x1f; - uint8_t g3 = (next_argb1555[2] >> 5) | ((next_argb1555[3] & 0x03) << 3); - uint8_t r3 = (next_argb1555[3] & 0x7c) >> 2; - - b0 = (b0 << 3) | (b0 >> 2); - g0 = (g0 << 3) | (g0 >> 2); - r0 = (r0 << 3) | (r0 >> 2); - b1 = (b1 << 3) | (b1 >> 2); - g1 = (g1 << 3) | (g1 >> 2); - r1 = (r1 << 3) | (r1 >> 2); - b2 = (b2 << 3) | (b2 >> 2); - g2 = (g2 << 3) | (g2 >> 2); - r2 = (r2 << 3) | (r2 >> 2); - b3 = (b3 << 3) | (b3 >> 2); - g3 = (g3 << 3) | (g3 >> 2); - r3 = (r3 << 3) | (r3 >> 2); - -#if LIBYUV_ARGBTOUV_PAVGB - uint8_t ab = AVGB(AVGB(b0, b2), AVGB(b1, b3)); - uint8_t ag = AVGB(AVGB(g0, g2), AVGB(g1, g3)); - uint8_t ar = AVGB(AVGB(r0, r2), AVGB(r1, r3)); - dst_u[0] = RGBToU(ar, ag, ab); - dst_v[0] = RGBToV(ar, ag, ab); -#else - uint16_t b = (b0 + b1 + b2 + b3 + 1) >> 1; - uint16_t g = (g0 + g1 + g2 + g3 + 1) >> 1; - uint16_t r = (r0 + r1 + r2 + r3 + 1) >> 1; - dst_u[0] = RGB2xToU(r, g, b); - dst_v[0] = RGB2xToV(r, g, b); -#endif - - src_argb1555 += 4; - next_argb1555 += 4; - dst_u += 1; - dst_v += 1; - } - if (width & 1) { - uint8_t b0 = src_argb1555[0] & 0x1f; - uint8_t g0 = (src_argb1555[0] >> 5) | ((src_argb1555[1] & 0x03) << 3); - uint8_t r0 = (src_argb1555[1] & 0x7c) >> 2; - uint8_t b2 = 
next_argb1555[0] & 0x1f; - uint8_t g2 = (next_argb1555[0] >> 5) | ((next_argb1555[1] & 0x03) << 3); - uint8_t r2 = next_argb1555[1] >> 3; - - b0 = (b0 << 3) | (b0 >> 2); - g0 = (g0 << 3) | (g0 >> 2); - r0 = (r0 << 3) | (r0 >> 2); - b2 = (b2 << 3) | (b2 >> 2); - g2 = (g2 << 3) | (g2 >> 2); - r2 = (r2 << 3) | (r2 >> 2); - -#if LIBYUV_ARGBTOUV_PAVGB - uint8_t ab = AVGB(b0, b2); - uint8_t ag = AVGB(g0, g2); - uint8_t ar = AVGB(r0, r2); - dst_u[0] = RGBToU(ar, ag, ab); - dst_v[0] = RGBToV(ar, ag, ab); -#else - uint16_t b = b0 + b2; - uint16_t g = g0 + g2; - uint16_t r = r0 + r2; - dst_u[0] = RGB2xToU(r, g, b); - dst_v[0] = RGB2xToV(r, g, b); -#endif - } -} - -void ARGB4444ToUVRow_C(const uint8_t* src_argb4444, - int src_stride_argb4444, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - const uint8_t* next_argb4444 = src_argb4444 + src_stride_argb4444; - int x; - for (x = 0; x < width - 1; x += 2) { - uint8_t b0 = src_argb4444[0] & 0x0f; - uint8_t g0 = src_argb4444[0] >> 4; - uint8_t r0 = src_argb4444[1] & 0x0f; - uint8_t b1 = src_argb4444[2] & 0x0f; - uint8_t g1 = src_argb4444[2] >> 4; - uint8_t r1 = src_argb4444[3] & 0x0f; - uint8_t b2 = next_argb4444[0] & 0x0f; - uint8_t g2 = next_argb4444[0] >> 4; - uint8_t r2 = next_argb4444[1] & 0x0f; - uint8_t b3 = next_argb4444[2] & 0x0f; - uint8_t g3 = next_argb4444[2] >> 4; - uint8_t r3 = next_argb4444[3] & 0x0f; - - b0 = (b0 << 4) | b0; - g0 = (g0 << 4) | g0; - r0 = (r0 << 4) | r0; - b1 = (b1 << 4) | b1; - g1 = (g1 << 4) | g1; - r1 = (r1 << 4) | r1; - b2 = (b2 << 4) | b2; - g2 = (g2 << 4) | g2; - r2 = (r2 << 4) | r2; - b3 = (b3 << 4) | b3; - g3 = (g3 << 4) | g3; - r3 = (r3 << 4) | r3; - -#if LIBYUV_ARGBTOUV_PAVGB - uint8_t ab = AVGB(AVGB(b0, b2), AVGB(b1, b3)); - uint8_t ag = AVGB(AVGB(g0, g2), AVGB(g1, g3)); - uint8_t ar = AVGB(AVGB(r0, r2), AVGB(r1, r3)); - dst_u[0] = RGBToU(ar, ag, ab); - dst_v[0] = RGBToV(ar, ag, ab); -#else - uint16_t b = (b0 + b1 + b2 + b3 + 1) >> 1; - uint16_t g = (g0 + g1 + g2 + g3 + 1) >> 1; - uint16_t r = (r0 + r1 + r2 + r3 + 1) >> 1; - dst_u[0] = RGB2xToU(r, g, b); - dst_v[0] = RGB2xToV(r, g, b); -#endif - - src_argb4444 += 4; - next_argb4444 += 4; - dst_u += 1; - dst_v += 1; - } - if (width & 1) { - uint8_t b0 = src_argb4444[0] & 0x0f; - uint8_t g0 = src_argb4444[0] >> 4; - uint8_t r0 = src_argb4444[1] & 0x0f; - uint8_t b2 = next_argb4444[0] & 0x0f; - uint8_t g2 = next_argb4444[0] >> 4; - uint8_t r2 = next_argb4444[1] & 0x0f; - - b0 = (b0 << 4) | b0; - g0 = (g0 << 4) | g0; - r0 = (r0 << 4) | r0; - b2 = (b2 << 4) | b2; - g2 = (g2 << 4) | g2; - r2 = (r2 << 4) | r2; - -#if LIBYUV_ARGBTOUV_PAVGB - uint8_t ab = AVGB(b0, b2); - uint8_t ag = AVGB(g0, g2); - uint8_t ar = AVGB(r0, r2); - dst_u[0] = RGBToU(ar, ag, ab); - dst_v[0] = RGBToV(ar, ag, ab); -#else - uint16_t b = b0 + b2; - uint16_t g = g0 + g2; - uint16_t r = r0 + r2; - dst_u[0] = RGB2xToU(r, g, b); - dst_v[0] = RGB2xToV(r, g, b); -#endif - } -} - -void ARGBToUV444Row_C(const uint8_t* src_argb, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - int x; - for (x = 0; x < width; ++x) { - uint8_t ab = src_argb[0]; - uint8_t ag = src_argb[1]; - uint8_t ar = src_argb[2]; - dst_u[0] = RGBToU(ar, ag, ab); - dst_v[0] = RGBToV(ar, ag, ab); - src_argb += 4; - dst_u += 1; - dst_v += 1; - } -} - -void ARGBGrayRow_C(const uint8_t* src_argb, uint8_t* dst_argb, int width) { - int x; - for (x = 0; x < width; ++x) { - uint8_t y = RGBToYJ(src_argb[2], src_argb[1], src_argb[0]); - dst_argb[2] = dst_argb[1] = dst_argb[0] = y; - dst_argb[3] = src_argb[3]; - dst_argb += 4; - src_argb += 4; 
- } -} - -// Convert a row of image to Sepia tone. -void ARGBSepiaRow_C(uint8_t* dst_argb, int width) { - int x; - for (x = 0; x < width; ++x) { - int b = dst_argb[0]; - int g = dst_argb[1]; - int r = dst_argb[2]; - int sb = (b * 17 + g * 68 + r * 35) >> 7; - int sg = (b * 22 + g * 88 + r * 45) >> 7; - int sr = (b * 24 + g * 98 + r * 50) >> 7; - // b does not over flow. a is preserved from original. - dst_argb[0] = sb; - dst_argb[1] = clamp255(sg); - dst_argb[2] = clamp255(sr); - dst_argb += 4; - } -} - -// Apply color matrix to a row of image. Matrix is signed. -// TODO(fbarchard): Consider adding rounding (+32). -void ARGBColorMatrixRow_C(const uint8_t* src_argb, - uint8_t* dst_argb, - const int8_t* matrix_argb, - int width) { - int x; - for (x = 0; x < width; ++x) { - int b = src_argb[0]; - int g = src_argb[1]; - int r = src_argb[2]; - int a = src_argb[3]; - int sb = (b * matrix_argb[0] + g * matrix_argb[1] + r * matrix_argb[2] + - a * matrix_argb[3]) >> - 6; - int sg = (b * matrix_argb[4] + g * matrix_argb[5] + r * matrix_argb[6] + - a * matrix_argb[7]) >> - 6; - int sr = (b * matrix_argb[8] + g * matrix_argb[9] + r * matrix_argb[10] + - a * matrix_argb[11]) >> - 6; - int sa = (b * matrix_argb[12] + g * matrix_argb[13] + r * matrix_argb[14] + - a * matrix_argb[15]) >> - 6; - dst_argb[0] = Clamp(sb); - dst_argb[1] = Clamp(sg); - dst_argb[2] = Clamp(sr); - dst_argb[3] = Clamp(sa); - src_argb += 4; - dst_argb += 4; - } -} - -// Apply color table to a row of image. -void ARGBColorTableRow_C(uint8_t* dst_argb, - const uint8_t* table_argb, - int width) { - int x; - for (x = 0; x < width; ++x) { - int b = dst_argb[0]; - int g = dst_argb[1]; - int r = dst_argb[2]; - int a = dst_argb[3]; - dst_argb[0] = table_argb[b * 4 + 0]; - dst_argb[1] = table_argb[g * 4 + 1]; - dst_argb[2] = table_argb[r * 4 + 2]; - dst_argb[3] = table_argb[a * 4 + 3]; - dst_argb += 4; - } -} - -// Apply color table to a row of image. 
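// The table holds 256 entries of 4 bytes (B, G, R, A); each channel value
// indexes its own byte within an entry, so the channels remap independently.
// This RGB variant leaves the alpha byte untouched.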
-void RGBColorTableRow_C(uint8_t* dst_argb, - const uint8_t* table_argb, - int width) { - int x; - for (x = 0; x < width; ++x) { - int b = dst_argb[0]; - int g = dst_argb[1]; - int r = dst_argb[2]; - dst_argb[0] = table_argb[b * 4 + 0]; - dst_argb[1] = table_argb[g * 4 + 1]; - dst_argb[2] = table_argb[r * 4 + 2]; - dst_argb += 4; - } -} - -void ARGBQuantizeRow_C(uint8_t* dst_argb, - int scale, - int interval_size, - int interval_offset, - int width) { - int x; - for (x = 0; x < width; ++x) { - int b = dst_argb[0]; - int g = dst_argb[1]; - int r = dst_argb[2]; - dst_argb[0] = (b * scale >> 16) * interval_size + interval_offset; - dst_argb[1] = (g * scale >> 16) * interval_size + interval_offset; - dst_argb[2] = (r * scale >> 16) * interval_size + interval_offset; - dst_argb += 4; - } -} - -#define REPEAT8(v) (v) | ((v) << 8) -#define SHADE(f, v) v* f >> 24 - -void ARGBShadeRow_C(const uint8_t* src_argb, - uint8_t* dst_argb, - int width, - uint32_t value) { - const uint32_t b_scale = REPEAT8(value & 0xff); - const uint32_t g_scale = REPEAT8((value >> 8) & 0xff); - const uint32_t r_scale = REPEAT8((value >> 16) & 0xff); - const uint32_t a_scale = REPEAT8(value >> 24); - - int i; - for (i = 0; i < width; ++i) { - const uint32_t b = REPEAT8(src_argb[0]); - const uint32_t g = REPEAT8(src_argb[1]); - const uint32_t r = REPEAT8(src_argb[2]); - const uint32_t a = REPEAT8(src_argb[3]); - dst_argb[0] = SHADE(b, b_scale); - dst_argb[1] = SHADE(g, g_scale); - dst_argb[2] = SHADE(r, r_scale); - dst_argb[3] = SHADE(a, a_scale); - src_argb += 4; - dst_argb += 4; - } -} -#undef REPEAT8 -#undef SHADE - -#define REPEAT8(v) (v) | ((v) << 8) -#define SHADE(f, v) v* f >> 16 - -void ARGBMultiplyRow_C(const uint8_t* src_argb, - const uint8_t* src_argb1, - uint8_t* dst_argb, - int width) { - int i; - for (i = 0; i < width; ++i) { - const uint32_t b = REPEAT8(src_argb[0]); - const uint32_t g = REPEAT8(src_argb[1]); - const uint32_t r = REPEAT8(src_argb[2]); - const uint32_t a = REPEAT8(src_argb[3]); - const uint32_t b_scale = src_argb1[0]; - const uint32_t g_scale = src_argb1[1]; - const uint32_t r_scale = src_argb1[2]; - const uint32_t a_scale = src_argb1[3]; - dst_argb[0] = SHADE(b, b_scale); - dst_argb[1] = SHADE(g, g_scale); - dst_argb[2] = SHADE(r, r_scale); - dst_argb[3] = SHADE(a, a_scale); - src_argb += 4; - src_argb1 += 4; - dst_argb += 4; - } -} -#undef REPEAT8 -#undef SHADE - -#define SHADE(f, v) clamp255(v + f) - -void ARGBAddRow_C(const uint8_t* src_argb, - const uint8_t* src_argb1, - uint8_t* dst_argb, - int width) { - int i; - for (i = 0; i < width; ++i) { - const int b = src_argb[0]; - const int g = src_argb[1]; - const int r = src_argb[2]; - const int a = src_argb[3]; - const int b_add = src_argb1[0]; - const int g_add = src_argb1[1]; - const int r_add = src_argb1[2]; - const int a_add = src_argb1[3]; - dst_argb[0] = SHADE(b, b_add); - dst_argb[1] = SHADE(g, g_add); - dst_argb[2] = SHADE(r, r_add); - dst_argb[3] = SHADE(a, a_add); - src_argb += 4; - src_argb1 += 4; - dst_argb += 4; - } -} -#undef SHADE - -#define SHADE(f, v) clamp0(f - v) - -void ARGBSubtractRow_C(const uint8_t* src_argb, - const uint8_t* src_argb1, - uint8_t* dst_argb, - int width) { - int i; - for (i = 0; i < width; ++i) { - const int b = src_argb[0]; - const int g = src_argb[1]; - const int r = src_argb[2]; - const int a = src_argb[3]; - const int b_sub = src_argb1[0]; - const int g_sub = src_argb1[1]; - const int r_sub = src_argb1[2]; - const int a_sub = src_argb1[3]; - dst_argb[0] = SHADE(b, b_sub); - dst_argb[1] = SHADE(g, 
g_sub); - dst_argb[2] = SHADE(r, r_sub); - dst_argb[3] = SHADE(a, a_sub); - src_argb += 4; - src_argb1 += 4; - dst_argb += 4; - } -} -#undef SHADE - -// Sobel functions which mimics SSSE3. -void SobelXRow_C(const uint8_t* src_y0, - const uint8_t* src_y1, - const uint8_t* src_y2, - uint8_t* dst_sobelx, - int width) { - int i; - for (i = 0; i < width; ++i) { - int a = src_y0[i]; - int b = src_y1[i]; - int c = src_y2[i]; - int a_sub = src_y0[i + 2]; - int b_sub = src_y1[i + 2]; - int c_sub = src_y2[i + 2]; - int a_diff = a - a_sub; - int b_diff = b - b_sub; - int c_diff = c - c_sub; - int sobel = Abs(a_diff + b_diff * 2 + c_diff); - dst_sobelx[i] = (uint8_t)(clamp255(sobel)); - } -} - -void SobelYRow_C(const uint8_t* src_y0, - const uint8_t* src_y1, - uint8_t* dst_sobely, - int width) { - int i; - for (i = 0; i < width; ++i) { - int a = src_y0[i + 0]; - int b = src_y0[i + 1]; - int c = src_y0[i + 2]; - int a_sub = src_y1[i + 0]; - int b_sub = src_y1[i + 1]; - int c_sub = src_y1[i + 2]; - int a_diff = a - a_sub; - int b_diff = b - b_sub; - int c_diff = c - c_sub; - int sobel = Abs(a_diff + b_diff * 2 + c_diff); - dst_sobely[i] = (uint8_t)(clamp255(sobel)); - } -} - -void SobelRow_C(const uint8_t* src_sobelx, - const uint8_t* src_sobely, - uint8_t* dst_argb, - int width) { - int i; - for (i = 0; i < width; ++i) { - int r = src_sobelx[i]; - int b = src_sobely[i]; - int s = clamp255(r + b); - dst_argb[0] = (uint8_t)(s); - dst_argb[1] = (uint8_t)(s); - dst_argb[2] = (uint8_t)(s); - dst_argb[3] = (uint8_t)(255u); - dst_argb += 4; - } -} - -void SobelToPlaneRow_C(const uint8_t* src_sobelx, - const uint8_t* src_sobely, - uint8_t* dst_y, - int width) { - int i; - for (i = 0; i < width; ++i) { - int r = src_sobelx[i]; - int b = src_sobely[i]; - int s = clamp255(r + b); - dst_y[i] = (uint8_t)(s); - } -} - -void SobelXYRow_C(const uint8_t* src_sobelx, - const uint8_t* src_sobely, - uint8_t* dst_argb, - int width) { - int i; - for (i = 0; i < width; ++i) { - int r = src_sobelx[i]; - int b = src_sobely[i]; - int g = clamp255(r + b); - dst_argb[0] = (uint8_t)(b); - dst_argb[1] = (uint8_t)(g); - dst_argb[2] = (uint8_t)(r); - dst_argb[3] = (uint8_t)(255u); - dst_argb += 4; - } -} - -void J400ToARGBRow_C(const uint8_t* src_y, uint8_t* dst_argb, int width) { - // Copy a Y to RGB. - int x; - for (x = 0; x < width; ++x) { - uint8_t y = src_y[0]; - dst_argb[2] = dst_argb[1] = dst_argb[0] = y; - dst_argb[3] = 255u; - dst_argb += 4; - ++src_y; - } -} - -// Macros to create SIMD specific yuv to rgb conversion constants. - -// clang-format off - -#if defined(__aarch64__) || defined(__arm__) -// Bias values to round, and subtract 128 from U and V. -// For B and R this is negative. For G this is positive. 
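// Folding the -128 offset of U and V into these biases lets the ARM path use
// unsigned U and V directly: b16 = y1 + (u * ub) - bb expands to
// y1 + UB * (u - 128) + YB, so no per-pixel subtraction of 128 is needed.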
-#define BB (UB * 128 - YB) -#define BG (UG * 128 + VG * 128 + YB) -#define BR (VR * 128 - YB) - -#define YUBCONSTANTSBODY(YG, YB, UB, UG, VG, VR) \ - {{UB, VR, UG, VG, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, \ - {YG, BB, BG, BR, YB, 0, 0, 0}} -#else -#define YUBCONSTANTSBODY(YG, YB, UB, UG, VG, VR) \ - {{UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, \ - UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0}, \ - {UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, \ - UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG}, \ - {0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, \ - 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR}, \ - {YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG}, \ - {YB, YB, YB, YB, YB, YB, YB, YB, YB, YB, YB, YB, YB, YB, YB, YB}} -#endif - -// clang-format on - -#define MAKEYUVCONSTANTS(name, YG, YB, UB, UG, VG, VR) \ - const struct YuvConstants SIMD_ALIGNED(kYuv##name##Constants) = \ - YUBCONSTANTSBODY(YG, YB, UB, UG, VG, VR); \ - const struct YuvConstants SIMD_ALIGNED(kYvu##name##Constants) = \ - YUBCONSTANTSBODY(YG, YB, VR, VG, UG, UB); - -// TODO(fbarchard): Generate SIMD structures from float matrix. - -// BT.601 limited range YUV to RGB reference -// R = (Y - 16) * 1.164 + V * 1.596 -// G = (Y - 16) * 1.164 - U * 0.391 - V * 0.813 -// B = (Y - 16) * 1.164 + U * 2.018 -// KR = 0.299; KB = 0.114 - -// U and V contributions to R,G,B. -#ifdef LIBYUV_UNLIMITED_DATA -#define UB 129 /* round(2.018 * 64) */ -#else -#define UB 128 /* max(128, round(2.018 * 64)) */ -#endif -#define UG 25 /* round(0.391 * 64) */ -#define VG 52 /* round(0.813 * 64) */ -#define VR 102 /* round(1.596 * 64) */ - -// Y contribution to R,G,B. Scale and bias. -#define YG 18997 /* round(1.164 * 64 * 256 * 256 / 257) */ -#define YB -1160 /* 1.164 * 64 * -16 + 64 / 2 */ - -MAKEYUVCONSTANTS(I601, YG, YB, UB, UG, VG, VR) - -#undef YG -#undef YB -#undef UB -#undef UG -#undef VG -#undef VR - -// BT.601 full range YUV to RGB reference (aka JPEG) -// * R = Y + V * 1.40200 -// * G = Y - U * 0.34414 - V * 0.71414 -// * B = Y + U * 1.77200 -// KR = 0.299; KB = 0.114 - -// U and V contributions to R,G,B. -#define UB 113 /* round(1.77200 * 64) */ -#define UG 22 /* round(0.34414 * 64) */ -#define VG 46 /* round(0.71414 * 64) */ -#define VR 90 /* round(1.40200 * 64) */ - -// Y contribution to R,G,B. Scale and bias. -#define YG 16320 /* round(1.000 * 64 * 256 * 256 / 257) */ -#define YB 32 /* 64 / 2 */ - -MAKEYUVCONSTANTS(JPEG, YG, YB, UB, UG, VG, VR) - -#undef YG -#undef YB -#undef UB -#undef UG -#undef VG -#undef VR - -// BT.709 limited range YUV to RGB reference -// R = (Y - 16) * 1.164 + V * 1.793 -// G = (Y - 16) * 1.164 - U * 0.213 - V * 0.533 -// B = (Y - 16) * 1.164 + U * 2.112 -// KR = 0.2126, KB = 0.0722 - -// U and V contributions to R,G,B. -#ifdef LIBYUV_UNLIMITED_DATA -#define UB 135 /* round(2.112 * 64) */ -#else -#define UB 128 /* max(128, round(2.112 * 64)) */ -#endif -#define UG 14 /* round(0.213 * 64) */ -#define VG 34 /* round(0.533 * 64) */ -#define VR 115 /* round(1.793 * 64) */ - -// Y contribution to R,G,B. Scale and bias. 
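// The row code replicates Y into 16 bits (y * 0x0101 == y * 257), so the 257
// divisor in YG cancels it: (y * 257 * YG) >> 16 is roughly y * 1.164 * 64,
// i.e. the luma contribution in 10.6 fixed point. YB folds in the -16 offset
// of limited-range Y plus a rounding half (64 / 2).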
-#define YG 18997 /* round(1.164 * 64 * 256 * 256 / 257) */ -#define YB -1160 /* 1.164 * 64 * -16 + 64 / 2 */ - -MAKEYUVCONSTANTS(H709, YG, YB, UB, UG, VG, VR) - -#undef YG -#undef YB -#undef UB -#undef UG -#undef VG -#undef VR - -// BT.709 full range YUV to RGB reference -// R = Y + V * 1.5748 -// G = Y - U * 0.18732 - V * 0.46812 -// B = Y + U * 1.8556 -// KR = 0.2126, KB = 0.0722 - -// U and V contributions to R,G,B. -#define UB 119 /* round(1.8556 * 64) */ -#define UG 12 /* round(0.18732 * 64) */ -#define VG 30 /* round(0.46812 * 64) */ -#define VR 101 /* round(1.5748 * 64) */ - -// Y contribution to R,G,B. Scale and bias. (same as jpeg) -#define YG 16320 /* round(1 * 64 * 256 * 256 / 257) */ -#define YB 32 /* 64 / 2 */ - -MAKEYUVCONSTANTS(F709, YG, YB, UB, UG, VG, VR) - -#undef YG -#undef YB -#undef UB -#undef UG -#undef VG -#undef VR - -// BT.2020 limited range YUV to RGB reference -// R = (Y - 16) * 1.164384 + V * 1.67867 -// G = (Y - 16) * 1.164384 - U * 0.187326 - V * 0.65042 -// B = (Y - 16) * 1.164384 + U * 2.14177 -// KR = 0.2627; KB = 0.0593 - -// U and V contributions to R,G,B. -#ifdef LIBYUV_UNLIMITED_DATA -#define UB 137 /* round(2.142 * 64) */ -#else -#define UB 128 /* max(128, round(2.142 * 64)) */ -#endif -#define UG 12 /* round(0.187326 * 64) */ -#define VG 42 /* round(0.65042 * 64) */ -#define VR 107 /* round(1.67867 * 64) */ - -// Y contribution to R,G,B. Scale and bias. -#define YG 19003 /* round(1.164384 * 64 * 256 * 256 / 257) */ -#define YB -1160 /* 1.164384 * 64 * -16 + 64 / 2 */ - -MAKEYUVCONSTANTS(2020, YG, YB, UB, UG, VG, VR) - -#undef YG -#undef YB -#undef UB -#undef UG -#undef VG -#undef VR - -// BT.2020 full range YUV to RGB reference -// R = Y + V * 1.474600 -// G = Y - U * 0.164553 - V * 0.571353 -// B = Y + U * 1.881400 -// KR = 0.2627; KB = 0.0593 - -#define UB 120 /* round(1.881400 * 64) */ -#define UG 11 /* round(0.164553 * 64) */ -#define VG 37 /* round(0.571353 * 64) */ -#define VR 94 /* round(1.474600 * 64) */ - -// Y contribution to R,G,B. Scale and bias. (same as jpeg) -#define YG 16320 /* round(1 * 64 * 256 * 256 / 257) */ -#define YB 32 /* 64 / 2 */ - -MAKEYUVCONSTANTS(V2020, YG, YB, UB, UG, VG, VR) - -#undef YG -#undef YB -#undef UB -#undef UG -#undef VG -#undef VR - -#undef BB -#undef BG -#undef BR - -#undef MAKEYUVCONSTANTS - -#if defined(__aarch64__) || defined(__arm__) -#define LOAD_YUV_CONSTANTS \ - int ub = yuvconstants->kUVCoeff[0]; \ - int vr = yuvconstants->kUVCoeff[1]; \ - int ug = yuvconstants->kUVCoeff[2]; \ - int vg = yuvconstants->kUVCoeff[3]; \ - int yg = yuvconstants->kRGBCoeffBias[0]; \ - int bb = yuvconstants->kRGBCoeffBias[1]; \ - int bg = yuvconstants->kRGBCoeffBias[2]; \ - int br = yuvconstants->kRGBCoeffBias[3] - -#define CALC_RGB16 \ - int32_t y1 = (uint32_t)(y32 * yg) >> 16; \ - int b16 = y1 + (u * ub) - bb; \ - int g16 = y1 + bg - (u * ug + v * vg); \ - int r16 = y1 + (v * vr) - br -#else -#define LOAD_YUV_CONSTANTS \ - int ub = yuvconstants->kUVToB[0]; \ - int ug = yuvconstants->kUVToG[0]; \ - int vg = yuvconstants->kUVToG[1]; \ - int vr = yuvconstants->kUVToR[1]; \ - int yg = yuvconstants->kYToRgb[0]; \ - int yb = yuvconstants->kYBiasToRgb[0] - -#define CALC_RGB16 \ - int32_t y1 = ((uint32_t)(y32 * yg) >> 16) + yb; \ - int8_t ui = u; \ - int8_t vi = v; \ - ui -= 0x80; \ - vi -= 0x80; \ - int b16 = y1 + (ui * ub); \ - int g16 = y1 - (ui * ug + vi * vg); \ - int r16 = y1 + (vi * vr) -#endif - -// C reference code that mimics the YUV assembly. -// Reads 8 bit YUV and leaves result as 16 bit. 
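// CALC_RGB16 leaves each channel in 10.6 fixed point; the 8 bit paths shift
// right by 6 and clamp, while the *_16 helpers below return the raw value.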
-static __inline void YuvPixel(uint8_t y, - uint8_t u, - uint8_t v, - uint8_t* b, - uint8_t* g, - uint8_t* r, - const struct YuvConstants* yuvconstants) { - LOAD_YUV_CONSTANTS; - uint32_t y32 = y * 0x0101; - CALC_RGB16; - *b = Clamp((int32_t)(b16) >> 6); - *g = Clamp((int32_t)(g16) >> 6); - *r = Clamp((int32_t)(r16) >> 6); -} - -// Reads 8 bit YUV and leaves result as 16 bit. -static __inline void YuvPixel8_16(uint8_t y, - uint8_t u, - uint8_t v, - int* b, - int* g, - int* r, - const struct YuvConstants* yuvconstants) { - LOAD_YUV_CONSTANTS; - uint32_t y32 = y * 0x0101; - CALC_RGB16; - *b = b16; - *g = g16; - *r = r16; -} - -// C reference code that mimics the YUV 16 bit assembly. -// Reads 10 bit YUV and leaves result as 16 bit. -static __inline void YuvPixel10_16(uint16_t y, - uint16_t u, - uint16_t v, - int* b, - int* g, - int* r, - const struct YuvConstants* yuvconstants) { - LOAD_YUV_CONSTANTS; - uint32_t y32 = y << 6; - u = clamp255(u >> 2); - v = clamp255(v >> 2); - CALC_RGB16; - *b = b16; - *g = g16; - *r = r16; -} - -// C reference code that mimics the YUV 16 bit assembly. -// Reads 12 bit YUV and leaves result as 16 bit. -static __inline void YuvPixel12_16(int16_t y, - int16_t u, - int16_t v, - int* b, - int* g, - int* r, - const struct YuvConstants* yuvconstants) { - LOAD_YUV_CONSTANTS; - uint32_t y32 = y << 4; - u = clamp255(u >> 4); - v = clamp255(v >> 4); - CALC_RGB16; - *b = b16; - *g = g16; - *r = r16; -} - -// C reference code that mimics the YUV 10 bit assembly. -// Reads 10 bit YUV and clamps down to 8 bit RGB. -static __inline void YuvPixel10(uint16_t y, - uint16_t u, - uint16_t v, - uint8_t* b, - uint8_t* g, - uint8_t* r, - const struct YuvConstants* yuvconstants) { - int b16; - int g16; - int r16; - YuvPixel10_16(y, u, v, &b16, &g16, &r16, yuvconstants); - *b = Clamp(b16 >> 6); - *g = Clamp(g16 >> 6); - *r = Clamp(r16 >> 6); -} - -// C reference code that mimics the YUV 12 bit assembly. -// Reads 12 bit YUV and clamps down to 8 bit RGB. -static __inline void YuvPixel12(uint16_t y, - uint16_t u, - uint16_t v, - uint8_t* b, - uint8_t* g, - uint8_t* r, - const struct YuvConstants* yuvconstants) { - int b16; - int g16; - int r16; - YuvPixel12_16(y, u, v, &b16, &g16, &r16, yuvconstants); - *b = Clamp(b16 >> 6); - *g = Clamp(g16 >> 6); - *r = Clamp(r16 >> 6); -} - -// C reference code that mimics the YUV 16 bit assembly. -// Reads 16 bit YUV and leaves result as 8 bit. -static __inline void YuvPixel16_8(uint16_t y, - uint16_t u, - uint16_t v, - uint8_t* b, - uint8_t* g, - uint8_t* r, - const struct YuvConstants* yuvconstants) { - LOAD_YUV_CONSTANTS; - uint32_t y32 = y; - u = clamp255(u >> 8); - v = clamp255(v >> 8); - CALC_RGB16; - *b = Clamp((int32_t)(b16) >> 6); - *g = Clamp((int32_t)(g16) >> 6); - *r = Clamp((int32_t)(r16) >> 6); -} - -// C reference code that mimics the YUV 16 bit assembly. -// Reads 16 bit YUV and leaves result as 16 bit. -static __inline void YuvPixel16_16(uint16_t y, - uint16_t u, - uint16_t v, - int* b, - int* g, - int* r, - const struct YuvConstants* yuvconstants) { - LOAD_YUV_CONSTANTS; - uint32_t y32 = y; - u = clamp255(u >> 8); - v = clamp255(v >> 8); - CALC_RGB16; - *b = b16; - *g = g16; - *r = r16; -} - -// C reference code that mimics the YUV assembly. -// Reads 8 bit YUV and leaves result as 8 bit. 
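// YPixel applies only the luma scale and bias, so all three output channels
// are equal; it is the monochrome path used by I400ToARGBRow_C further down.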
-static __inline void YPixel(uint8_t y, - uint8_t* b, - uint8_t* g, - uint8_t* r, - const struct YuvConstants* yuvconstants) { -#if defined(__aarch64__) || defined(__arm__) - int yg = yuvconstants->kRGBCoeffBias[0]; - int ygb = yuvconstants->kRGBCoeffBias[4]; -#else - int ygb = yuvconstants->kYBiasToRgb[0]; - int yg = yuvconstants->kYToRgb[0]; -#endif - uint32_t y1 = (uint32_t)(y * 0x0101 * yg) >> 16; - *b = Clamp(((int32_t)(y1) + ygb) >> 6); - *g = Clamp(((int32_t)(y1) + ygb) >> 6); - *r = Clamp(((int32_t)(y1) + ygb) >> 6); -} - -void I444ToARGBRow_C(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* rgb_buf, - const struct YuvConstants* yuvconstants, - int width) { - int x; - for (x = 0; x < width; ++x) { - YuvPixel(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1, - rgb_buf + 2, yuvconstants); - rgb_buf[3] = 255; - src_y += 1; - src_u += 1; - src_v += 1; - rgb_buf += 4; // Advance 1 pixel. - } -} - -// Also used for 420 -void I422ToARGBRow_C(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* rgb_buf, - const struct YuvConstants* yuvconstants, - int width) { - int x; - for (x = 0; x < width - 1; x += 2) { - YuvPixel(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1, - rgb_buf + 2, yuvconstants); - rgb_buf[3] = 255; - YuvPixel(src_y[1], src_u[0], src_v[0], rgb_buf + 4, rgb_buf + 5, - rgb_buf + 6, yuvconstants); - rgb_buf[7] = 255; - src_y += 2; - src_u += 1; - src_v += 1; - rgb_buf += 8; // Advance 2 pixels. - } - if (width & 1) { - YuvPixel(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1, - rgb_buf + 2, yuvconstants); - rgb_buf[3] = 255; - } -} - -// 10 bit YUV to ARGB -void I210ToARGBRow_C(const uint16_t* src_y, - const uint16_t* src_u, - const uint16_t* src_v, - uint8_t* rgb_buf, - const struct YuvConstants* yuvconstants, - int width) { - int x; - for (x = 0; x < width - 1; x += 2) { - YuvPixel10(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1, - rgb_buf + 2, yuvconstants); - rgb_buf[3] = 255; - YuvPixel10(src_y[1], src_u[0], src_v[0], rgb_buf + 4, rgb_buf + 5, - rgb_buf + 6, yuvconstants); - rgb_buf[7] = 255; - src_y += 2; - src_u += 1; - src_v += 1; - rgb_buf += 8; // Advance 2 pixels. - } - if (width & 1) { - YuvPixel10(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1, - rgb_buf + 2, yuvconstants); - rgb_buf[3] = 255; - } -} - -void I410ToARGBRow_C(const uint16_t* src_y, - const uint16_t* src_u, - const uint16_t* src_v, - uint8_t* rgb_buf, - const struct YuvConstants* yuvconstants, - int width) { - int x; - for (x = 0; x < width; ++x) { - YuvPixel10(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1, - rgb_buf + 2, yuvconstants); - rgb_buf[3] = 255; - src_y += 1; - src_u += 1; - src_v += 1; - rgb_buf += 4; // Advance 1 pixels. - } -} - -void I210AlphaToARGBRow_C(const uint16_t* src_y, - const uint16_t* src_u, - const uint16_t* src_v, - const uint16_t* src_a, - uint8_t* rgb_buf, - const struct YuvConstants* yuvconstants, - int width) { - int x; - for (x = 0; x < width - 1; x += 2) { - YuvPixel10(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1, - rgb_buf + 2, yuvconstants); - rgb_buf[3] = clamp255(src_a[0] >> 2); - YuvPixel10(src_y[1], src_u[0], src_v[0], rgb_buf + 4, rgb_buf + 5, - rgb_buf + 6, yuvconstants); - rgb_buf[7] = clamp255(src_a[1] >> 2); - src_y += 2; - src_u += 1; - src_v += 1; - src_a += 2; - rgb_buf += 8; // Advance 2 pixels. 
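// (src_a holds 10 bit alpha samples; the >> 2 above reduces them to 8 bits)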
- } - if (width & 1) { - YuvPixel10(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1, - rgb_buf + 2, yuvconstants); - rgb_buf[3] = clamp255(src_a[0] >> 2); - } -} - -void I410AlphaToARGBRow_C(const uint16_t* src_y, - const uint16_t* src_u, - const uint16_t* src_v, - const uint16_t* src_a, - uint8_t* rgb_buf, - const struct YuvConstants* yuvconstants, - int width) { - int x; - for (x = 0; x < width; ++x) { - YuvPixel10(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1, - rgb_buf + 2, yuvconstants); - rgb_buf[3] = clamp255(src_a[0] >> 2); - src_y += 1; - src_u += 1; - src_v += 1; - src_a += 1; - rgb_buf += 4; // Advance 1 pixels. - } -} - -// 12 bit YUV to ARGB -void I212ToARGBRow_C(const uint16_t* src_y, - const uint16_t* src_u, - const uint16_t* src_v, - uint8_t* rgb_buf, - const struct YuvConstants* yuvconstants, - int width) { - int x; - for (x = 0; x < width - 1; x += 2) { - YuvPixel12(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1, - rgb_buf + 2, yuvconstants); - rgb_buf[3] = 255; - YuvPixel12(src_y[1], src_u[0], src_v[0], rgb_buf + 4, rgb_buf + 5, - rgb_buf + 6, yuvconstants); - rgb_buf[7] = 255; - src_y += 2; - src_u += 1; - src_v += 1; - rgb_buf += 8; // Advance 2 pixels. - } - if (width & 1) { - YuvPixel12(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1, - rgb_buf + 2, yuvconstants); - rgb_buf[3] = 255; - } -} - -static void StoreAR30(uint8_t* rgb_buf, int b, int g, int r) { - uint32_t ar30; - b = b >> 4; // convert 8 bit 10.6 to 10 bit. - g = g >> 4; - r = r >> 4; - b = Clamp10(b); - g = Clamp10(g); - r = Clamp10(r); - ar30 = b | ((uint32_t)g << 10) | ((uint32_t)r << 20) | 0xc0000000; - (*(uint32_t*)rgb_buf) = ar30; -} - -// 10 bit YUV to 10 bit AR30 -void I210ToAR30Row_C(const uint16_t* src_y, - const uint16_t* src_u, - const uint16_t* src_v, - uint8_t* rgb_buf, - const struct YuvConstants* yuvconstants, - int width) { - int x; - int b; - int g; - int r; - for (x = 0; x < width - 1; x += 2) { - YuvPixel10_16(src_y[0], src_u[0], src_v[0], &b, &g, &r, yuvconstants); - StoreAR30(rgb_buf, b, g, r); - YuvPixel10_16(src_y[1], src_u[0], src_v[0], &b, &g, &r, yuvconstants); - StoreAR30(rgb_buf + 4, b, g, r); - src_y += 2; - src_u += 1; - src_v += 1; - rgb_buf += 8; // Advance 2 pixels. - } - if (width & 1) { - YuvPixel10_16(src_y[0], src_u[0], src_v[0], &b, &g, &r, yuvconstants); - StoreAR30(rgb_buf, b, g, r); - } -} - -// 12 bit YUV to 10 bit AR30 -void I212ToAR30Row_C(const uint16_t* src_y, - const uint16_t* src_u, - const uint16_t* src_v, - uint8_t* rgb_buf, - const struct YuvConstants* yuvconstants, - int width) { - int x; - int b; - int g; - int r; - for (x = 0; x < width - 1; x += 2) { - YuvPixel12_16(src_y[0], src_u[0], src_v[0], &b, &g, &r, yuvconstants); - StoreAR30(rgb_buf, b, g, r); - YuvPixel12_16(src_y[1], src_u[0], src_v[0], &b, &g, &r, yuvconstants); - StoreAR30(rgb_buf + 4, b, g, r); - src_y += 2; - src_u += 1; - src_v += 1; - rgb_buf += 8; // Advance 2 pixels. - } - if (width & 1) { - YuvPixel12_16(src_y[0], src_u[0], src_v[0], &b, &g, &r, yuvconstants); - StoreAR30(rgb_buf, b, g, r); - } -} - -void I410ToAR30Row_C(const uint16_t* src_y, - const uint16_t* src_u, - const uint16_t* src_v, - uint8_t* rgb_buf, - const struct YuvConstants* yuvconstants, - int width) { - int x; - int b; - int g; - int r; - for (x = 0; x < width; ++x) { - YuvPixel10_16(src_y[0], src_u[0], src_v[0], &b, &g, &r, yuvconstants); - StoreAR30(rgb_buf, b, g, r); - src_y += 1; - src_u += 1; - src_v += 1; - rgb_buf += 4; // Advance 1 pixel. 
- } -} - -// P210 has 10 bits in msb of 16 bit NV12 style layout. -void P210ToARGBRow_C(const uint16_t* src_y, - const uint16_t* src_uv, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { - int x; - for (x = 0; x < width - 1; x += 2) { - YuvPixel16_8(src_y[0], src_uv[0], src_uv[1], dst_argb + 0, dst_argb + 1, - dst_argb + 2, yuvconstants); - dst_argb[3] = 255; - YuvPixel16_8(src_y[1], src_uv[0], src_uv[1], dst_argb + 4, dst_argb + 5, - dst_argb + 6, yuvconstants); - dst_argb[7] = 255; - src_y += 2; - src_uv += 2; - dst_argb += 8; // Advance 2 pixels. - } - if (width & 1) { - YuvPixel16_8(src_y[0], src_uv[0], src_uv[1], dst_argb + 0, dst_argb + 1, - dst_argb + 2, yuvconstants); - dst_argb[3] = 255; - } -} - -void P410ToARGBRow_C(const uint16_t* src_y, - const uint16_t* src_uv, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { - int x; - for (x = 0; x < width; ++x) { - YuvPixel16_8(src_y[0], src_uv[0], src_uv[1], dst_argb + 0, dst_argb + 1, - dst_argb + 2, yuvconstants); - dst_argb[3] = 255; - src_y += 1; - src_uv += 2; - dst_argb += 4; // Advance 1 pixels. - } -} - -void P210ToAR30Row_C(const uint16_t* src_y, - const uint16_t* src_uv, - uint8_t* dst_ar30, - const struct YuvConstants* yuvconstants, - int width) { - int x; - int b; - int g; - int r; - for (x = 0; x < width - 1; x += 2) { - YuvPixel16_16(src_y[0], src_uv[0], src_uv[1], &b, &g, &r, yuvconstants); - StoreAR30(dst_ar30, b, g, r); - YuvPixel16_16(src_y[1], src_uv[0], src_uv[1], &b, &g, &r, yuvconstants); - StoreAR30(dst_ar30 + 4, b, g, r); - src_y += 2; - src_uv += 2; - dst_ar30 += 8; // Advance 2 pixels. - } - if (width & 1) { - YuvPixel16_16(src_y[0], src_uv[0], src_uv[1], &b, &g, &r, yuvconstants); - StoreAR30(dst_ar30, b, g, r); - } -} - -void P410ToAR30Row_C(const uint16_t* src_y, - const uint16_t* src_uv, - uint8_t* dst_ar30, - const struct YuvConstants* yuvconstants, - int width) { - int x; - int b; - int g; - int r; - for (x = 0; x < width; ++x) { - YuvPixel16_16(src_y[0], src_uv[0], src_uv[1], &b, &g, &r, yuvconstants); - StoreAR30(dst_ar30, b, g, r); - src_y += 1; - src_uv += 2; - dst_ar30 += 4; // Advance 1 pixel. - } -} - -// 8 bit YUV to 10 bit AR30 -// Uses same code as 10 bit YUV bit shifts the 8 bit values up to 10 bits. -void I422ToAR30Row_C(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* rgb_buf, - const struct YuvConstants* yuvconstants, - int width) { - int x; - int b; - int g; - int r; - for (x = 0; x < width - 1; x += 2) { - YuvPixel8_16(src_y[0], src_u[0], src_v[0], &b, &g, &r, yuvconstants); - StoreAR30(rgb_buf, b, g, r); - YuvPixel8_16(src_y[1], src_u[0], src_v[0], &b, &g, &r, yuvconstants); - StoreAR30(rgb_buf + 4, b, g, r); - src_y += 2; - src_u += 1; - src_v += 1; - rgb_buf += 8; // Advance 2 pixels. - } - if (width & 1) { - YuvPixel8_16(src_y[0], src_u[0], src_v[0], &b, &g, &r, yuvconstants); - StoreAR30(rgb_buf, b, g, r); - } -} - -void I444AlphaToARGBRow_C(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - const uint8_t* src_a, - uint8_t* rgb_buf, - const struct YuvConstants* yuvconstants, - int width) { - int x; - for (x = 0; x < width; ++x) { - YuvPixel(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1, - rgb_buf + 2, yuvconstants); - rgb_buf[3] = src_a[0]; - src_y += 1; - src_u += 1; - src_v += 1; - src_a += 1; - rgb_buf += 4; // Advance 1 pixel. 
- } -} - -void I422AlphaToARGBRow_C(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - const uint8_t* src_a, - uint8_t* rgb_buf, - const struct YuvConstants* yuvconstants, - int width) { - int x; - for (x = 0; x < width - 1; x += 2) { - YuvPixel(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1, - rgb_buf + 2, yuvconstants); - rgb_buf[3] = src_a[0]; - YuvPixel(src_y[1], src_u[0], src_v[0], rgb_buf + 4, rgb_buf + 5, - rgb_buf + 6, yuvconstants); - rgb_buf[7] = src_a[1]; - src_y += 2; - src_u += 1; - src_v += 1; - src_a += 2; - rgb_buf += 8; // Advance 2 pixels. - } - if (width & 1) { - YuvPixel(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1, - rgb_buf + 2, yuvconstants); - rgb_buf[3] = src_a[0]; - } -} - -void I422ToRGB24Row_C(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* rgb_buf, - const struct YuvConstants* yuvconstants, - int width) { - int x; - for (x = 0; x < width - 1; x += 2) { - YuvPixel(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1, - rgb_buf + 2, yuvconstants); - YuvPixel(src_y[1], src_u[0], src_v[0], rgb_buf + 3, rgb_buf + 4, - rgb_buf + 5, yuvconstants); - src_y += 2; - src_u += 1; - src_v += 1; - rgb_buf += 6; // Advance 2 pixels. - } - if (width & 1) { - YuvPixel(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1, - rgb_buf + 2, yuvconstants); - } -} - -void I422ToARGB4444Row_C(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* dst_argb4444, - const struct YuvConstants* yuvconstants, - int width) { - uint8_t b0; - uint8_t g0; - uint8_t r0; - uint8_t b1; - uint8_t g1; - uint8_t r1; - int x; - for (x = 0; x < width - 1; x += 2) { - YuvPixel(src_y[0], src_u[0], src_v[0], &b0, &g0, &r0, yuvconstants); - YuvPixel(src_y[1], src_u[0], src_v[0], &b1, &g1, &r1, yuvconstants); - b0 = b0 >> 4; - g0 = g0 >> 4; - r0 = r0 >> 4; - b1 = b1 >> 4; - g1 = g1 >> 4; - r1 = r1 >> 4; - *(uint32_t*)(dst_argb4444) = b0 | (g0 << 4) | (r0 << 8) | (b1 << 16) | - (g1 << 20) | (r1 << 24) | 0xf000f000; - src_y += 2; - src_u += 1; - src_v += 1; - dst_argb4444 += 4; // Advance 2 pixels. - } - if (width & 1) { - YuvPixel(src_y[0], src_u[0], src_v[0], &b0, &g0, &r0, yuvconstants); - b0 = b0 >> 4; - g0 = g0 >> 4; - r0 = r0 >> 4; - *(uint16_t*)(dst_argb4444) = b0 | (g0 << 4) | (r0 << 8) | 0xf000; - } -} - -void I422ToARGB1555Row_C(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* dst_argb1555, - const struct YuvConstants* yuvconstants, - int width) { - uint8_t b0; - uint8_t g0; - uint8_t r0; - uint8_t b1; - uint8_t g1; - uint8_t r1; - int x; - for (x = 0; x < width - 1; x += 2) { - YuvPixel(src_y[0], src_u[0], src_v[0], &b0, &g0, &r0, yuvconstants); - YuvPixel(src_y[1], src_u[0], src_v[0], &b1, &g1, &r1, yuvconstants); - b0 = b0 >> 3; - g0 = g0 >> 3; - r0 = r0 >> 3; - b1 = b1 >> 3; - g1 = g1 >> 3; - r1 = r1 >> 3; - *(uint32_t*)(dst_argb1555) = b0 | (g0 << 5) | (r0 << 10) | (b1 << 16) | - (g1 << 21) | (r1 << 26) | 0x80008000; - src_y += 2; - src_u += 1; - src_v += 1; - dst_argb1555 += 4; // Advance 2 pixels. 
- } - if (width & 1) { - YuvPixel(src_y[0], src_u[0], src_v[0], &b0, &g0, &r0, yuvconstants); - b0 = b0 >> 3; - g0 = g0 >> 3; - r0 = r0 >> 3; - *(uint16_t*)(dst_argb1555) = b0 | (g0 << 5) | (r0 << 10) | 0x8000; - } -} - -void I422ToRGB565Row_C(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* dst_rgb565, - const struct YuvConstants* yuvconstants, - int width) { - uint8_t b0; - uint8_t g0; - uint8_t r0; - uint8_t b1; - uint8_t g1; - uint8_t r1; - int x; - for (x = 0; x < width - 1; x += 2) { - YuvPixel(src_y[0], src_u[0], src_v[0], &b0, &g0, &r0, yuvconstants); - YuvPixel(src_y[1], src_u[0], src_v[0], &b1, &g1, &r1, yuvconstants); - b0 = b0 >> 3; - g0 = g0 >> 2; - r0 = r0 >> 3; - b1 = b1 >> 3; - g1 = g1 >> 2; - r1 = r1 >> 3; - *(uint32_t*)(dst_rgb565) = - b0 | (g0 << 5) | (r0 << 11) | (b1 << 16) | (g1 << 21) | (r1 << 27); - src_y += 2; - src_u += 1; - src_v += 1; - dst_rgb565 += 4; // Advance 2 pixels. - } - if (width & 1) { - YuvPixel(src_y[0], src_u[0], src_v[0], &b0, &g0, &r0, yuvconstants); - b0 = b0 >> 3; - g0 = g0 >> 2; - r0 = r0 >> 3; - *(uint16_t*)(dst_rgb565) = b0 | (g0 << 5) | (r0 << 11); - } -} - -void NV12ToARGBRow_C(const uint8_t* src_y, - const uint8_t* src_uv, - uint8_t* rgb_buf, - const struct YuvConstants* yuvconstants, - int width) { - int x; - for (x = 0; x < width - 1; x += 2) { - YuvPixel(src_y[0], src_uv[0], src_uv[1], rgb_buf + 0, rgb_buf + 1, - rgb_buf + 2, yuvconstants); - rgb_buf[3] = 255; - YuvPixel(src_y[1], src_uv[0], src_uv[1], rgb_buf + 4, rgb_buf + 5, - rgb_buf + 6, yuvconstants); - rgb_buf[7] = 255; - src_y += 2; - src_uv += 2; - rgb_buf += 8; // Advance 2 pixels. - } - if (width & 1) { - YuvPixel(src_y[0], src_uv[0], src_uv[1], rgb_buf + 0, rgb_buf + 1, - rgb_buf + 2, yuvconstants); - rgb_buf[3] = 255; - } -} - -void NV21ToARGBRow_C(const uint8_t* src_y, - const uint8_t* src_vu, - uint8_t* rgb_buf, - const struct YuvConstants* yuvconstants, - int width) { - int x; - for (x = 0; x < width - 1; x += 2) { - YuvPixel(src_y[0], src_vu[1], src_vu[0], rgb_buf + 0, rgb_buf + 1, - rgb_buf + 2, yuvconstants); - rgb_buf[3] = 255; - YuvPixel(src_y[1], src_vu[1], src_vu[0], rgb_buf + 4, rgb_buf + 5, - rgb_buf + 6, yuvconstants); - rgb_buf[7] = 255; - src_y += 2; - src_vu += 2; - rgb_buf += 8; // Advance 2 pixels. - } - if (width & 1) { - YuvPixel(src_y[0], src_vu[1], src_vu[0], rgb_buf + 0, rgb_buf + 1, - rgb_buf + 2, yuvconstants); - rgb_buf[3] = 255; - } -} - -void NV12ToRGB24Row_C(const uint8_t* src_y, - const uint8_t* src_uv, - uint8_t* rgb_buf, - const struct YuvConstants* yuvconstants, - int width) { - int x; - for (x = 0; x < width - 1; x += 2) { - YuvPixel(src_y[0], src_uv[0], src_uv[1], rgb_buf + 0, rgb_buf + 1, - rgb_buf + 2, yuvconstants); - YuvPixel(src_y[1], src_uv[0], src_uv[1], rgb_buf + 3, rgb_buf + 4, - rgb_buf + 5, yuvconstants); - src_y += 2; - src_uv += 2; - rgb_buf += 6; // Advance 2 pixels. - } - if (width & 1) { - YuvPixel(src_y[0], src_uv[0], src_uv[1], rgb_buf + 0, rgb_buf + 1, - rgb_buf + 2, yuvconstants); - } -} - -void NV21ToRGB24Row_C(const uint8_t* src_y, - const uint8_t* src_vu, - uint8_t* rgb_buf, - const struct YuvConstants* yuvconstants, - int width) { - int x; - for (x = 0; x < width - 1; x += 2) { - YuvPixel(src_y[0], src_vu[1], src_vu[0], rgb_buf + 0, rgb_buf + 1, - rgb_buf + 2, yuvconstants); - YuvPixel(src_y[1], src_vu[1], src_vu[0], rgb_buf + 3, rgb_buf + 4, - rgb_buf + 5, yuvconstants); - src_y += 2; - src_vu += 2; - rgb_buf += 6; // Advance 2 pixels. 
- } - if (width & 1) { - YuvPixel(src_y[0], src_vu[1], src_vu[0], rgb_buf + 0, rgb_buf + 1, - rgb_buf + 2, yuvconstants); - } -} - -void NV12ToRGB565Row_C(const uint8_t* src_y, - const uint8_t* src_uv, - uint8_t* dst_rgb565, - const struct YuvConstants* yuvconstants, - int width) { - uint8_t b0; - uint8_t g0; - uint8_t r0; - uint8_t b1; - uint8_t g1; - uint8_t r1; - int x; - for (x = 0; x < width - 1; x += 2) { - YuvPixel(src_y[0], src_uv[0], src_uv[1], &b0, &g0, &r0, yuvconstants); - YuvPixel(src_y[1], src_uv[0], src_uv[1], &b1, &g1, &r1, yuvconstants); - b0 = b0 >> 3; - g0 = g0 >> 2; - r0 = r0 >> 3; - b1 = b1 >> 3; - g1 = g1 >> 2; - r1 = r1 >> 3; - *(uint32_t*)(dst_rgb565) = - b0 | (g0 << 5) | (r0 << 11) | (b1 << 16) | (g1 << 21) | (r1 << 27); - src_y += 2; - src_uv += 2; - dst_rgb565 += 4; // Advance 2 pixels. - } - if (width & 1) { - YuvPixel(src_y[0], src_uv[0], src_uv[1], &b0, &g0, &r0, yuvconstants); - b0 = b0 >> 3; - g0 = g0 >> 2; - r0 = r0 >> 3; - *(uint16_t*)(dst_rgb565) = b0 | (g0 << 5) | (r0 << 11); - } -} - -void YUY2ToARGBRow_C(const uint8_t* src_yuy2, - uint8_t* rgb_buf, - const struct YuvConstants* yuvconstants, - int width) { - int x; - for (x = 0; x < width - 1; x += 2) { - YuvPixel(src_yuy2[0], src_yuy2[1], src_yuy2[3], rgb_buf + 0, rgb_buf + 1, - rgb_buf + 2, yuvconstants); - rgb_buf[3] = 255; - YuvPixel(src_yuy2[2], src_yuy2[1], src_yuy2[3], rgb_buf + 4, rgb_buf + 5, - rgb_buf + 6, yuvconstants); - rgb_buf[7] = 255; - src_yuy2 += 4; - rgb_buf += 8; // Advance 2 pixels. - } - if (width & 1) { - YuvPixel(src_yuy2[0], src_yuy2[1], src_yuy2[3], rgb_buf + 0, rgb_buf + 1, - rgb_buf + 2, yuvconstants); - rgb_buf[3] = 255; - } -} - -void UYVYToARGBRow_C(const uint8_t* src_uyvy, - uint8_t* rgb_buf, - const struct YuvConstants* yuvconstants, - int width) { - int x; - for (x = 0; x < width - 1; x += 2) { - YuvPixel(src_uyvy[1], src_uyvy[0], src_uyvy[2], rgb_buf + 0, rgb_buf + 1, - rgb_buf + 2, yuvconstants); - rgb_buf[3] = 255; - YuvPixel(src_uyvy[3], src_uyvy[0], src_uyvy[2], rgb_buf + 4, rgb_buf + 5, - rgb_buf + 6, yuvconstants); - rgb_buf[7] = 255; - src_uyvy += 4; - rgb_buf += 8; // Advance 2 pixels. - } - if (width & 1) { - YuvPixel(src_uyvy[1], src_uyvy[0], src_uyvy[2], rgb_buf + 0, rgb_buf + 1, - rgb_buf + 2, yuvconstants); - rgb_buf[3] = 255; - } -} - -void I422ToRGBARow_C(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* rgb_buf, - const struct YuvConstants* yuvconstants, - int width) { - int x; - for (x = 0; x < width - 1; x += 2) { - YuvPixel(src_y[0], src_u[0], src_v[0], rgb_buf + 1, rgb_buf + 2, - rgb_buf + 3, yuvconstants); - rgb_buf[0] = 255; - YuvPixel(src_y[1], src_u[0], src_v[0], rgb_buf + 5, rgb_buf + 6, - rgb_buf + 7, yuvconstants); - rgb_buf[4] = 255; - src_y += 2; - src_u += 1; - src_v += 1; - rgb_buf += 8; // Advance 2 pixels. - } - if (width & 1) { - YuvPixel(src_y[0], src_u[0], src_v[0], rgb_buf + 1, rgb_buf + 2, - rgb_buf + 3, yuvconstants); - rgb_buf[0] = 255; - } -} - -void I400ToARGBRow_C(const uint8_t* src_y, - uint8_t* rgb_buf, - const struct YuvConstants* yuvconstants, - int width) { - int x; - for (x = 0; x < width - 1; x += 2) { - YPixel(src_y[0], rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, yuvconstants); - rgb_buf[3] = 255; - YPixel(src_y[1], rgb_buf + 4, rgb_buf + 5, rgb_buf + 6, yuvconstants); - rgb_buf[7] = 255; - src_y += 2; - rgb_buf += 8; // Advance 2 pixels. 
- } - if (width & 1) { - YPixel(src_y[0], rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, yuvconstants); - rgb_buf[3] = 255; - } -} - -void MirrorRow_C(const uint8_t* src, uint8_t* dst, int width) { - int x; - src += width - 1; - for (x = 0; x < width - 1; x += 2) { - dst[x] = src[0]; - dst[x + 1] = src[-1]; - src -= 2; - } - if (width & 1) { - dst[width - 1] = src[0]; - } -} - -void MirrorUVRow_C(const uint8_t* src_uv, uint8_t* dst_uv, int width) { - int x; - src_uv += (width - 1) << 1; - for (x = 0; x < width; ++x) { - dst_uv[0] = src_uv[0]; - dst_uv[1] = src_uv[1]; - src_uv -= 2; - dst_uv += 2; - } -} - -void MirrorSplitUVRow_C(const uint8_t* src_uv, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - int x; - src_uv += (width - 1) << 1; - for (x = 0; x < width - 1; x += 2) { - dst_u[x] = src_uv[0]; - dst_u[x + 1] = src_uv[-2]; - dst_v[x] = src_uv[1]; - dst_v[x + 1] = src_uv[-2 + 1]; - src_uv -= 4; - } - if (width & 1) { - dst_u[width - 1] = src_uv[0]; - dst_v[width - 1] = src_uv[1]; - } -} - -void ARGBMirrorRow_C(const uint8_t* src, uint8_t* dst, int width) { - int x; - const uint32_t* src32 = (const uint32_t*)(src); - uint32_t* dst32 = (uint32_t*)(dst); - src32 += width - 1; - for (x = 0; x < width - 1; x += 2) { - dst32[x] = src32[0]; - dst32[x + 1] = src32[-1]; - src32 -= 2; - } - if (width & 1) { - dst32[width - 1] = src32[0]; - } -} - -void RGB24MirrorRow_C(const uint8_t* src_rgb24, uint8_t* dst_rgb24, int width) { - int x; - src_rgb24 += width * 3 - 3; - for (x = 0; x < width; ++x) { - uint8_t b = src_rgb24[0]; - uint8_t g = src_rgb24[1]; - uint8_t r = src_rgb24[2]; - dst_rgb24[0] = b; - dst_rgb24[1] = g; - dst_rgb24[2] = r; - src_rgb24 -= 3; - dst_rgb24 += 3; - } -} - -void SplitUVRow_C(const uint8_t* src_uv, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - int x; - for (x = 0; x < width - 1; x += 2) { - dst_u[x] = src_uv[0]; - dst_u[x + 1] = src_uv[2]; - dst_v[x] = src_uv[1]; - dst_v[x + 1] = src_uv[3]; - src_uv += 4; - } - if (width & 1) { - dst_u[width - 1] = src_uv[0]; - dst_v[width - 1] = src_uv[1]; - } -} - -void MergeUVRow_C(const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* dst_uv, - int width) { - int x; - for (x = 0; x < width - 1; x += 2) { - dst_uv[0] = src_u[x]; - dst_uv[1] = src_v[x]; - dst_uv[2] = src_u[x + 1]; - dst_uv[3] = src_v[x + 1]; - dst_uv += 4; - } - if (width & 1) { - dst_uv[0] = src_u[width - 1]; - dst_uv[1] = src_v[width - 1]; - } -} - -void SplitRGBRow_C(const uint8_t* src_rgb, - uint8_t* dst_r, - uint8_t* dst_g, - uint8_t* dst_b, - int width) { - int x; - for (x = 0; x < width; ++x) { - dst_r[x] = src_rgb[0]; - dst_g[x] = src_rgb[1]; - dst_b[x] = src_rgb[2]; - src_rgb += 3; - } -} - -void MergeRGBRow_C(const uint8_t* src_r, - const uint8_t* src_g, - const uint8_t* src_b, - uint8_t* dst_rgb, - int width) { - int x; - for (x = 0; x < width; ++x) { - dst_rgb[0] = src_r[x]; - dst_rgb[1] = src_g[x]; - dst_rgb[2] = src_b[x]; - dst_rgb += 3; - } -} - -void SplitARGBRow_C(const uint8_t* src_argb, - uint8_t* dst_r, - uint8_t* dst_g, - uint8_t* dst_b, - uint8_t* dst_a, - int width) { - int x; - for (x = 0; x < width; ++x) { - dst_b[x] = src_argb[0]; - dst_g[x] = src_argb[1]; - dst_r[x] = src_argb[2]; - dst_a[x] = src_argb[3]; - src_argb += 4; - } -} - -void MergeARGBRow_C(const uint8_t* src_r, - const uint8_t* src_g, - const uint8_t* src_b, - const uint8_t* src_a, - uint8_t* dst_argb, - int width) { - int x; - for (x = 0; x < width; ++x) { - dst_argb[0] = src_b[x]; - dst_argb[1] = src_g[x]; - dst_argb[2] = src_r[x]; - dst_argb[3] = src_a[x]; - dst_argb += 4; 
- } -} - -void MergeXR30Row_C(const uint16_t* src_r, - const uint16_t* src_g, - const uint16_t* src_b, - uint8_t* dst_ar30, - int depth, - int width) { - assert(depth >= 10); - assert(depth <= 16); - int x; - int shift = depth - 10; - uint32_t* dst_ar30_32 = (uint32_t*)dst_ar30; - for (x = 0; x < width; ++x) { - uint32_t r = clamp1023(src_r[x] >> shift); - uint32_t g = clamp1023(src_g[x] >> shift); - uint32_t b = clamp1023(src_b[x] >> shift); - dst_ar30_32[x] = b | (g << 10) | (r << 20) | 0xc0000000; - } -} - -void MergeAR64Row_C(const uint16_t* src_r, - const uint16_t* src_g, - const uint16_t* src_b, - const uint16_t* src_a, - uint16_t* dst_ar64, - int depth, - int width) { - assert(depth >= 1); - assert(depth <= 16); - int x; - int shift = 16 - depth; - int max = (1 << depth) - 1; - for (x = 0; x < width; ++x) { - dst_ar64[0] = ClampMax(src_b[x], max) << shift; - dst_ar64[1] = ClampMax(src_g[x], max) << shift; - dst_ar64[2] = ClampMax(src_r[x], max) << shift; - dst_ar64[3] = ClampMax(src_a[x], max) << shift; - dst_ar64 += 4; - } -} - -void MergeARGB16To8Row_C(const uint16_t* src_r, - const uint16_t* src_g, - const uint16_t* src_b, - const uint16_t* src_a, - uint8_t* dst_argb, - int depth, - int width) { - assert(depth >= 8); - assert(depth <= 16); - int x; - int shift = depth - 8; - for (x = 0; x < width; ++x) { - dst_argb[0] = clamp255(src_b[x] >> shift); - dst_argb[1] = clamp255(src_g[x] >> shift); - dst_argb[2] = clamp255(src_r[x] >> shift); - dst_argb[3] = clamp255(src_a[x] >> shift); - dst_argb += 4; - } -} - -void MergeXR64Row_C(const uint16_t* src_r, - const uint16_t* src_g, - const uint16_t* src_b, - uint16_t* dst_ar64, - int depth, - int width) { - assert(depth >= 1); - assert(depth <= 16); - int x; - int shift = 16 - depth; - int max = (1 << depth) - 1; - for (x = 0; x < width; ++x) { - dst_ar64[0] = ClampMax(src_b[x], max) << shift; - dst_ar64[1] = ClampMax(src_g[x], max) << shift; - dst_ar64[2] = ClampMax(src_r[x], max) << shift; - dst_ar64[3] = 0xffff; - dst_ar64 += 4; - } -} - -void MergeXRGB16To8Row_C(const uint16_t* src_r, - const uint16_t* src_g, - const uint16_t* src_b, - uint8_t* dst_argb, - int depth, - int width) { - assert(depth >= 8); - assert(depth <= 16); - int x; - int shift = depth - 8; - for (x = 0; x < width; ++x) { - dst_argb[0] = clamp255(src_b[x] >> shift); - dst_argb[1] = clamp255(src_g[x] >> shift); - dst_argb[2] = clamp255(src_r[x] >> shift); - dst_argb[3] = 0xff; - dst_argb += 4; - } -} - -void SplitXRGBRow_C(const uint8_t* src_argb, - uint8_t* dst_r, - uint8_t* dst_g, - uint8_t* dst_b, - int width) { - int x; - for (x = 0; x < width; ++x) { - dst_b[x] = src_argb[0]; - dst_g[x] = src_argb[1]; - dst_r[x] = src_argb[2]; - src_argb += 4; - } -} - -void MergeXRGBRow_C(const uint8_t* src_r, - const uint8_t* src_g, - const uint8_t* src_b, - uint8_t* dst_argb, - int width) { - int x; - for (x = 0; x < width; ++x) { - dst_argb[0] = src_b[x]; - dst_argb[1] = src_g[x]; - dst_argb[2] = src_r[x]; - dst_argb[3] = 255; - dst_argb += 4; - } -} - -// Convert lsb formats to msb, depending on sample depth. -void MergeUVRow_16_C(const uint16_t* src_u, - const uint16_t* src_v, - uint16_t* dst_uv, - int depth, - int width) { - int shift = 16 - depth; - assert(depth >= 8); - assert(depth <= 16); - int x; - for (x = 0; x < width; ++x) { - dst_uv[0] = src_u[x] << shift; - dst_uv[1] = src_v[x] << shift; - dst_uv += 2; - } -} - -// Convert msb formats to lsb, depending on sample depth. 
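// For example, 10 bit data stored in the top bits of each 16 bit sample
// (P010 style) uses depth = 10, giving shift = 6 and results in 0..1023.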
-void SplitUVRow_16_C(const uint16_t* src_uv, - uint16_t* dst_u, - uint16_t* dst_v, - int depth, - int width) { - int shift = 16 - depth; - int x; - assert(depth >= 8); - assert(depth <= 16); - for (x = 0; x < width; ++x) { - dst_u[x] = src_uv[0] >> shift; - dst_v[x] = src_uv[1] >> shift; - src_uv += 2; - } -} - -void MultiplyRow_16_C(const uint16_t* src_y, - uint16_t* dst_y, - int scale, - int width) { - int x; - for (x = 0; x < width; ++x) { - dst_y[x] = src_y[x] * scale; - } -} - -void DivideRow_16_C(const uint16_t* src_y, - uint16_t* dst_y, - int scale, - int width) { - int x; - for (x = 0; x < width; ++x) { - dst_y[x] = (src_y[x] * scale) >> 16; - } -} - -// Use scale to convert lsb formats to msb, depending how many bits there are: -// 32768 = 9 bits -// 16384 = 10 bits -// 4096 = 12 bits -// 256 = 16 bits -void Convert16To8Row_C(const uint16_t* src_y, - uint8_t* dst_y, - int scale, - int width) { - int x; - assert(scale >= 256); - assert(scale <= 32768); - - for (x = 0; x < width; ++x) { - dst_y[x] = clamp255((src_y[x] * scale) >> 16); - } -} - -// Use scale to convert lsb formats to msb, depending how many bits there are: -// 1024 = 10 bits -void Convert8To16Row_C(const uint8_t* src_y, - uint16_t* dst_y, - int scale, - int width) { - int x; - scale *= 0x0101; // replicates the byte. - for (x = 0; x < width; ++x) { - dst_y[x] = (src_y[x] * scale) >> 16; - } -} - -void CopyRow_C(const uint8_t* src, uint8_t* dst, int count) { - memcpy(dst, src, count); -} - -void CopyRow_16_C(const uint16_t* src, uint16_t* dst, int count) { - memcpy(dst, src, count * 2); -} - -void SetRow_C(uint8_t* dst, uint8_t v8, int width) { - memset(dst, v8, width); -} - -void ARGBSetRow_C(uint8_t* dst_argb, uint32_t v32, int width) { - int x; - for (x = 0; x < width; ++x) { - memcpy(dst_argb + x * sizeof v32, &v32, sizeof v32); - } -} - -// Filter 2 rows of YUY2 UV's (422) into U and V (420). -void YUY2ToUVRow_C(const uint8_t* src_yuy2, - int src_stride_yuy2, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - // Output a row of UV values, filtering 2 rows of YUY2. - int x; - for (x = 0; x < width; x += 2) { - dst_u[0] = (src_yuy2[1] + src_yuy2[src_stride_yuy2 + 1] + 1) >> 1; - dst_v[0] = (src_yuy2[3] + src_yuy2[src_stride_yuy2 + 3] + 1) >> 1; - src_yuy2 += 4; - dst_u += 1; - dst_v += 1; - } -} - -// Copy row of YUY2 UV's (422) into U and V (422). -void YUY2ToUV422Row_C(const uint8_t* src_yuy2, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - // Output a row of UV values. - int x; - for (x = 0; x < width; x += 2) { - dst_u[0] = src_yuy2[1]; - dst_v[0] = src_yuy2[3]; - src_yuy2 += 4; - dst_u += 1; - dst_v += 1; - } -} - -// Copy row of YUY2 Y's (422) into Y (420/422). -void YUY2ToYRow_C(const uint8_t* src_yuy2, uint8_t* dst_y, int width) { - // Output a row of Y values. - int x; - for (x = 0; x < width - 1; x += 2) { - dst_y[x] = src_yuy2[0]; - dst_y[x + 1] = src_yuy2[2]; - src_yuy2 += 4; - } - if (width & 1) { - dst_y[width - 1] = src_yuy2[0]; - } -} - -// Filter 2 rows of UYVY UV's (422) into U and V (420). -void UYVYToUVRow_C(const uint8_t* src_uyvy, - int src_stride_uyvy, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - // Output a row of UV values. - int x; - for (x = 0; x < width; x += 2) { - dst_u[0] = (src_uyvy[0] + src_uyvy[src_stride_uyvy + 0] + 1) >> 1; - dst_v[0] = (src_uyvy[2] + src_uyvy[src_stride_uyvy + 2] + 1) >> 1; - src_uyvy += 4; - dst_u += 1; - dst_v += 1; - } -} - -// Copy row of UYVY UV's (422) into U and V (422). 
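// UYVY packs two pixels as U0 Y0 V0 Y1, so U is byte 0 and V is byte 2 of
// each 4 byte group, whereas YUY2 above carries them in bytes 1 and 3.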
-void UYVYToUV422Row_C(const uint8_t* src_uyvy, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - // Output a row of UV values. - int x; - for (x = 0; x < width; x += 2) { - dst_u[0] = src_uyvy[0]; - dst_v[0] = src_uyvy[2]; - src_uyvy += 4; - dst_u += 1; - dst_v += 1; - } -} - -// Copy row of UYVY Y's (422) into Y (420/422). -void UYVYToYRow_C(const uint8_t* src_uyvy, uint8_t* dst_y, int width) { - // Output a row of Y values. - int x; - for (x = 0; x < width - 1; x += 2) { - dst_y[x] = src_uyvy[1]; - dst_y[x + 1] = src_uyvy[3]; - src_uyvy += 4; - } - if (width & 1) { - dst_y[width - 1] = src_uyvy[1]; - } -} - -#define BLEND(f, b, a) clamp255((((256 - a) * b) >> 8) + f) - -// Blend src_argb over src_argb1 and store to dst_argb. -// dst_argb may be src_argb or src_argb1. -// This code mimics the SSSE3 version for better testability. -void ARGBBlendRow_C(const uint8_t* src_argb, - const uint8_t* src_argb1, - uint8_t* dst_argb, - int width) { - int x; - for (x = 0; x < width - 1; x += 2) { - uint32_t fb = src_argb[0]; - uint32_t fg = src_argb[1]; - uint32_t fr = src_argb[2]; - uint32_t a = src_argb[3]; - uint32_t bb = src_argb1[0]; - uint32_t bg = src_argb1[1]; - uint32_t br = src_argb1[2]; - dst_argb[0] = BLEND(fb, bb, a); - dst_argb[1] = BLEND(fg, bg, a); - dst_argb[2] = BLEND(fr, br, a); - dst_argb[3] = 255u; - - fb = src_argb[4 + 0]; - fg = src_argb[4 + 1]; - fr = src_argb[4 + 2]; - a = src_argb[4 + 3]; - bb = src_argb1[4 + 0]; - bg = src_argb1[4 + 1]; - br = src_argb1[4 + 2]; - dst_argb[4 + 0] = BLEND(fb, bb, a); - dst_argb[4 + 1] = BLEND(fg, bg, a); - dst_argb[4 + 2] = BLEND(fr, br, a); - dst_argb[4 + 3] = 255u; - src_argb += 8; - src_argb1 += 8; - dst_argb += 8; - } - - if (width & 1) { - uint32_t fb = src_argb[0]; - uint32_t fg = src_argb[1]; - uint32_t fr = src_argb[2]; - uint32_t a = src_argb[3]; - uint32_t bb = src_argb1[0]; - uint32_t bg = src_argb1[1]; - uint32_t br = src_argb1[2]; - dst_argb[0] = BLEND(fb, bb, a); - dst_argb[1] = BLEND(fg, bg, a); - dst_argb[2] = BLEND(fr, br, a); - dst_argb[3] = 255u; - } -} -#undef BLEND - -#define UBLEND(f, b, a) (((a)*f) + ((255 - a) * b) + 255) >> 8 -void BlendPlaneRow_C(const uint8_t* src0, - const uint8_t* src1, - const uint8_t* alpha, - uint8_t* dst, - int width) { - int x; - for (x = 0; x < width - 1; x += 2) { - dst[0] = UBLEND(src0[0], src1[0], alpha[0]); - dst[1] = UBLEND(src0[1], src1[1], alpha[1]); - src0 += 2; - src1 += 2; - alpha += 2; - dst += 2; - } - if (width & 1) { - dst[0] = UBLEND(src0[0], src1[0], alpha[0]); - } -} -#undef UBLEND - -#if defined(__aarch64__) || defined(__arm__) -#define ATTENUATE(f, a) (f * a + 128) >> 8 -#else -// This code mimics the SSSE3 version for better testability. -#define ATTENUATE(f, a) (a | (a << 8)) * (f | (f << 8)) >> 24 -#endif - -// Multiply source RGB by alpha and store to destination. 
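// Both variants avoid a divide: the ARM form computes (f * a + 128) >> 8, and
// the SSSE3-style form widens f and a by byte replication (x | (x << 8) is
// x * 257) and keeps the top byte of the 32 bit product, approximating
// f * a / 255.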
-void ARGBAttenuateRow_C(const uint8_t* src_argb, uint8_t* dst_argb, int width) { - int i; - for (i = 0; i < width - 1; i += 2) { - uint32_t b = src_argb[0]; - uint32_t g = src_argb[1]; - uint32_t r = src_argb[2]; - uint32_t a = src_argb[3]; - dst_argb[0] = ATTENUATE(b, a); - dst_argb[1] = ATTENUATE(g, a); - dst_argb[2] = ATTENUATE(r, a); - dst_argb[3] = a; - b = src_argb[4]; - g = src_argb[5]; - r = src_argb[6]; - a = src_argb[7]; - dst_argb[4] = ATTENUATE(b, a); - dst_argb[5] = ATTENUATE(g, a); - dst_argb[6] = ATTENUATE(r, a); - dst_argb[7] = a; - src_argb += 8; - dst_argb += 8; - } - - if (width & 1) { - const uint32_t b = src_argb[0]; - const uint32_t g = src_argb[1]; - const uint32_t r = src_argb[2]; - const uint32_t a = src_argb[3]; - dst_argb[0] = ATTENUATE(b, a); - dst_argb[1] = ATTENUATE(g, a); - dst_argb[2] = ATTENUATE(r, a); - dst_argb[3] = a; - } -} -#undef ATTENUATE - -// Divide source RGB by alpha and store to destination. -// b = (b * 255 + (a / 2)) / a; -// g = (g * 255 + (a / 2)) / a; -// r = (r * 255 + (a / 2)) / a; -// Reciprocal method is off by 1 on some values. ie 125 -// 8.8 fixed point inverse table with 1.0 in upper short and 1 / a in lower. -#define T(a) 0x01000000 + (0x10000 / a) -const uint32_t fixed_invtbl8[256] = { - 0x01000000, 0x0100ffff, T(0x02), T(0x03), T(0x04), T(0x05), T(0x06), - T(0x07), T(0x08), T(0x09), T(0x0a), T(0x0b), T(0x0c), T(0x0d), - T(0x0e), T(0x0f), T(0x10), T(0x11), T(0x12), T(0x13), T(0x14), - T(0x15), T(0x16), T(0x17), T(0x18), T(0x19), T(0x1a), T(0x1b), - T(0x1c), T(0x1d), T(0x1e), T(0x1f), T(0x20), T(0x21), T(0x22), - T(0x23), T(0x24), T(0x25), T(0x26), T(0x27), T(0x28), T(0x29), - T(0x2a), T(0x2b), T(0x2c), T(0x2d), T(0x2e), T(0x2f), T(0x30), - T(0x31), T(0x32), T(0x33), T(0x34), T(0x35), T(0x36), T(0x37), - T(0x38), T(0x39), T(0x3a), T(0x3b), T(0x3c), T(0x3d), T(0x3e), - T(0x3f), T(0x40), T(0x41), T(0x42), T(0x43), T(0x44), T(0x45), - T(0x46), T(0x47), T(0x48), T(0x49), T(0x4a), T(0x4b), T(0x4c), - T(0x4d), T(0x4e), T(0x4f), T(0x50), T(0x51), T(0x52), T(0x53), - T(0x54), T(0x55), T(0x56), T(0x57), T(0x58), T(0x59), T(0x5a), - T(0x5b), T(0x5c), T(0x5d), T(0x5e), T(0x5f), T(0x60), T(0x61), - T(0x62), T(0x63), T(0x64), T(0x65), T(0x66), T(0x67), T(0x68), - T(0x69), T(0x6a), T(0x6b), T(0x6c), T(0x6d), T(0x6e), T(0x6f), - T(0x70), T(0x71), T(0x72), T(0x73), T(0x74), T(0x75), T(0x76), - T(0x77), T(0x78), T(0x79), T(0x7a), T(0x7b), T(0x7c), T(0x7d), - T(0x7e), T(0x7f), T(0x80), T(0x81), T(0x82), T(0x83), T(0x84), - T(0x85), T(0x86), T(0x87), T(0x88), T(0x89), T(0x8a), T(0x8b), - T(0x8c), T(0x8d), T(0x8e), T(0x8f), T(0x90), T(0x91), T(0x92), - T(0x93), T(0x94), T(0x95), T(0x96), T(0x97), T(0x98), T(0x99), - T(0x9a), T(0x9b), T(0x9c), T(0x9d), T(0x9e), T(0x9f), T(0xa0), - T(0xa1), T(0xa2), T(0xa3), T(0xa4), T(0xa5), T(0xa6), T(0xa7), - T(0xa8), T(0xa9), T(0xaa), T(0xab), T(0xac), T(0xad), T(0xae), - T(0xaf), T(0xb0), T(0xb1), T(0xb2), T(0xb3), T(0xb4), T(0xb5), - T(0xb6), T(0xb7), T(0xb8), T(0xb9), T(0xba), T(0xbb), T(0xbc), - T(0xbd), T(0xbe), T(0xbf), T(0xc0), T(0xc1), T(0xc2), T(0xc3), - T(0xc4), T(0xc5), T(0xc6), T(0xc7), T(0xc8), T(0xc9), T(0xca), - T(0xcb), T(0xcc), T(0xcd), T(0xce), T(0xcf), T(0xd0), T(0xd1), - T(0xd2), T(0xd3), T(0xd4), T(0xd5), T(0xd6), T(0xd7), T(0xd8), - T(0xd9), T(0xda), T(0xdb), T(0xdc), T(0xdd), T(0xde), T(0xdf), - T(0xe0), T(0xe1), T(0xe2), T(0xe3), T(0xe4), T(0xe5), T(0xe6), - T(0xe7), T(0xe8), T(0xe9), T(0xea), T(0xeb), T(0xec), T(0xed), - T(0xee), T(0xef), T(0xf0), T(0xf1), T(0xf2), T(0xf3), T(0xf4), - 
T(0xf5), T(0xf6), T(0xf7), T(0xf8), T(0xf9), T(0xfa), T(0xfb), - T(0xfc), T(0xfd), T(0xfe), 0x01000100}; -#undef T - -void ARGBUnattenuateRow_C(const uint8_t* src_argb, - uint8_t* dst_argb, - int width) { - int i; - for (i = 0; i < width; ++i) { - uint32_t b = src_argb[0]; - uint32_t g = src_argb[1]; - uint32_t r = src_argb[2]; - const uint32_t a = src_argb[3]; - const uint32_t ia = fixed_invtbl8[a] & 0xffff; // 8.8 fixed point - b = (b * ia) >> 8; - g = (g * ia) >> 8; - r = (r * ia) >> 8; - // Clamping should not be necessary but is free in assembly. - dst_argb[0] = clamp255(b); - dst_argb[1] = clamp255(g); - dst_argb[2] = clamp255(r); - dst_argb[3] = a; - src_argb += 4; - dst_argb += 4; - } -} - -void ComputeCumulativeSumRow_C(const uint8_t* row, - int32_t* cumsum, - const int32_t* previous_cumsum, - int width) { - int32_t row_sum[4] = {0, 0, 0, 0}; - int x; - for (x = 0; x < width; ++x) { - row_sum[0] += row[x * 4 + 0]; - row_sum[1] += row[x * 4 + 1]; - row_sum[2] += row[x * 4 + 2]; - row_sum[3] += row[x * 4 + 3]; - cumsum[x * 4 + 0] = row_sum[0] + previous_cumsum[x * 4 + 0]; - cumsum[x * 4 + 1] = row_sum[1] + previous_cumsum[x * 4 + 1]; - cumsum[x * 4 + 2] = row_sum[2] + previous_cumsum[x * 4 + 2]; - cumsum[x * 4 + 3] = row_sum[3] + previous_cumsum[x * 4 + 3]; - } -} - -void CumulativeSumToAverageRow_C(const int32_t* tl, - const int32_t* bl, - int w, - int area, - uint8_t* dst, - int count) { - float ooa = 1.0f / area; - int i; - for (i = 0; i < count; ++i) { - dst[0] = (uint8_t)((bl[w + 0] + tl[0] - bl[0] - tl[w + 0]) * ooa); - dst[1] = (uint8_t)((bl[w + 1] + tl[1] - bl[1] - tl[w + 1]) * ooa); - dst[2] = (uint8_t)((bl[w + 2] + tl[2] - bl[2] - tl[w + 2]) * ooa); - dst[3] = (uint8_t)((bl[w + 3] + tl[3] - bl[3] - tl[w + 3]) * ooa); - dst += 4; - tl += 4; - bl += 4; - } -} - -// Copy pixels from rotated source to destination row with a slope. -LIBYUV_API -void ARGBAffineRow_C(const uint8_t* src_argb, - int src_argb_stride, - uint8_t* dst_argb, - const float* uv_dudv, - int width) { - int i; - // Render a row of pixels from source into a buffer. - float uv[2]; - uv[0] = uv_dudv[0]; - uv[1] = uv_dudv[1]; - for (i = 0; i < width; ++i) { - int x = (int)(uv[0]); - int y = (int)(uv[1]); - *(uint32_t*)(dst_argb) = - *(const uint32_t*)(src_argb + y * src_argb_stride + x * 4); - dst_argb += 4; - uv[0] += uv_dudv[2]; - uv[1] += uv_dudv[3]; - } -} - -// Blend 2 rows into 1. -static void HalfRow_C(const uint8_t* src_uv, - ptrdiff_t src_uv_stride, - uint8_t* dst_uv, - int width) { - int x; - for (x = 0; x < width; ++x) { - dst_uv[x] = (src_uv[x] + src_uv[src_uv_stride + x] + 1) >> 1; - } -} - -static void HalfRow_16_C(const uint16_t* src_uv, - ptrdiff_t src_uv_stride, - uint16_t* dst_uv, - int width) { - int x; - for (x = 0; x < width; ++x) { - dst_uv[x] = (src_uv[x] + src_uv[src_uv_stride + x] + 1) >> 1; - } -} - -// C version 2x2 -> 2x1. 
-void InterpolateRow_C(uint8_t* dst_ptr, - const uint8_t* src_ptr, - ptrdiff_t src_stride, - int width, - int source_y_fraction) { - int y1_fraction = source_y_fraction; - int y0_fraction = 256 - y1_fraction; - const uint8_t* src_ptr1 = src_ptr + src_stride; - int x; - if (y1_fraction == 0) { - memcpy(dst_ptr, src_ptr, width); - return; - } - if (y1_fraction == 128) { - HalfRow_C(src_ptr, src_stride, dst_ptr, width); - return; - } - for (x = 0; x < width - 1; x += 2) { - dst_ptr[0] = - (src_ptr[0] * y0_fraction + src_ptr1[0] * y1_fraction + 128) >> 8; - dst_ptr[1] = - (src_ptr[1] * y0_fraction + src_ptr1[1] * y1_fraction + 128) >> 8; - src_ptr += 2; - src_ptr1 += 2; - dst_ptr += 2; - } - if (width & 1) { - dst_ptr[0] = - (src_ptr[0] * y0_fraction + src_ptr1[0] * y1_fraction + 128) >> 8; - } -} - -void InterpolateRow_16_C(uint16_t* dst_ptr, - const uint16_t* src_ptr, - ptrdiff_t src_stride, - int width, - int source_y_fraction) { - int y1_fraction = source_y_fraction; - int y0_fraction = 256 - y1_fraction; - const uint16_t* src_ptr1 = src_ptr + src_stride; - int x; - if (source_y_fraction == 0) { - memcpy(dst_ptr, src_ptr, width * 2); - return; - } - if (source_y_fraction == 128) { - HalfRow_16_C(src_ptr, src_stride, dst_ptr, width); - return; - } - for (x = 0; x < width - 1; x += 2) { - dst_ptr[0] = (src_ptr[0] * y0_fraction + src_ptr1[0] * y1_fraction) >> 8; - dst_ptr[1] = (src_ptr[1] * y0_fraction + src_ptr1[1] * y1_fraction) >> 8; - src_ptr += 2; - src_ptr1 += 2; - dst_ptr += 2; - } - if (width & 1) { - dst_ptr[0] = (src_ptr[0] * y0_fraction + src_ptr1[0] * y1_fraction) >> 8; - } -} - -// Use first 4 shuffler values to reorder ARGB channels. -void ARGBShuffleRow_C(const uint8_t* src_argb, - uint8_t* dst_argb, - const uint8_t* shuffler, - int width) { - int index0 = shuffler[0]; - int index1 = shuffler[1]; - int index2 = shuffler[2]; - int index3 = shuffler[3]; - // Shuffle a row of ARGB. - int x; - for (x = 0; x < width; ++x) { - // To support in-place conversion. 
- uint8_t b = src_argb[index0]; - uint8_t g = src_argb[index1]; - uint8_t r = src_argb[index2]; - uint8_t a = src_argb[index3]; - dst_argb[0] = b; - dst_argb[1] = g; - dst_argb[2] = r; - dst_argb[3] = a; - src_argb += 4; - dst_argb += 4; - } -} - -void I422ToYUY2Row_C(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* dst_frame, - int width) { - int x; - for (x = 0; x < width - 1; x += 2) { - dst_frame[0] = src_y[0]; - dst_frame[1] = src_u[0]; - dst_frame[2] = src_y[1]; - dst_frame[3] = src_v[0]; - dst_frame += 4; - src_y += 2; - src_u += 1; - src_v += 1; - } - if (width & 1) { - dst_frame[0] = src_y[0]; - dst_frame[1] = src_u[0]; - dst_frame[2] = 0; - dst_frame[3] = src_v[0]; - } -} - -void I422ToUYVYRow_C(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* dst_frame, - int width) { - int x; - for (x = 0; x < width - 1; x += 2) { - dst_frame[0] = src_u[0]; - dst_frame[1] = src_y[0]; - dst_frame[2] = src_v[0]; - dst_frame[3] = src_y[1]; - dst_frame += 4; - src_y += 2; - src_u += 1; - src_v += 1; - } - if (width & 1) { - dst_frame[0] = src_u[0]; - dst_frame[1] = src_y[0]; - dst_frame[2] = src_v[0]; - dst_frame[3] = 0; - } -} - -void ARGBPolynomialRow_C(const uint8_t* src_argb, - uint8_t* dst_argb, - const float* poly, - int width) { - int i; - for (i = 0; i < width; ++i) { - float b = (float)(src_argb[0]); - float g = (float)(src_argb[1]); - float r = (float)(src_argb[2]); - float a = (float)(src_argb[3]); - float b2 = b * b; - float g2 = g * g; - float r2 = r * r; - float a2 = a * a; - float db = poly[0] + poly[4] * b; - float dg = poly[1] + poly[5] * g; - float dr = poly[2] + poly[6] * r; - float da = poly[3] + poly[7] * a; - float b3 = b2 * b; - float g3 = g2 * g; - float r3 = r2 * r; - float a3 = a2 * a; - db += poly[8] * b2; - dg += poly[9] * g2; - dr += poly[10] * r2; - da += poly[11] * a2; - db += poly[12] * b3; - dg += poly[13] * g3; - dr += poly[14] * r3; - da += poly[15] * a3; - - dst_argb[0] = Clamp((int32_t)(db)); - dst_argb[1] = Clamp((int32_t)(dg)); - dst_argb[2] = Clamp((int32_t)(dr)); - dst_argb[3] = Clamp((int32_t)(da)); - src_argb += 4; - dst_argb += 4; - } -} - -// Samples assumed to be unsigned in low 9, 10 or 12 bits. Scale factor -// adjust the source integer range to the half float range desired. - -// This magic constant is 2^-112. Multiplying by this -// is the same as subtracting 112 from the exponent, which -// is the difference in exponent bias between 32-bit and -// 16-bit floats. Once we've done this subtraction, we can -// simply extract the low bits of the exponent and the high -// bits of the mantissa from our float and we're done. 
- -// Work around GCC 7 punning warning -Wstrict-aliasing -#if defined(__GNUC__) -typedef uint32_t __attribute__((__may_alias__)) uint32_alias_t; -#else -typedef uint32_t uint32_alias_t; -#endif - -void HalfFloatRow_C(const uint16_t* src, - uint16_t* dst, - float scale, - int width) { - int i; - float mult = 1.9259299444e-34f * scale; - for (i = 0; i < width; ++i) { - float value = src[i] * mult; - dst[i] = (uint16_t)((*(const uint32_alias_t*)&value) >> 13); - } -} - -void ByteToFloatRow_C(const uint8_t* src, float* dst, float scale, int width) { - int i; - for (i = 0; i < width; ++i) { - float value = src[i] * scale; - dst[i] = value; - } -} - -void ARGBLumaColorTableRow_C(const uint8_t* src_argb, - uint8_t* dst_argb, - int width, - const uint8_t* luma, - uint32_t lumacoeff) { - uint32_t bc = lumacoeff & 0xff; - uint32_t gc = (lumacoeff >> 8) & 0xff; - uint32_t rc = (lumacoeff >> 16) & 0xff; - - int i; - for (i = 0; i < width - 1; i += 2) { - // Luminance in rows, color values in columns. - const uint8_t* luma0 = - ((src_argb[0] * bc + src_argb[1] * gc + src_argb[2] * rc) & 0x7F00u) + - luma; - const uint8_t* luma1; - dst_argb[0] = luma0[src_argb[0]]; - dst_argb[1] = luma0[src_argb[1]]; - dst_argb[2] = luma0[src_argb[2]]; - dst_argb[3] = src_argb[3]; - luma1 = - ((src_argb[4] * bc + src_argb[5] * gc + src_argb[6] * rc) & 0x7F00u) + - luma; - dst_argb[4] = luma1[src_argb[4]]; - dst_argb[5] = luma1[src_argb[5]]; - dst_argb[6] = luma1[src_argb[6]]; - dst_argb[7] = src_argb[7]; - src_argb += 8; - dst_argb += 8; - } - if (width & 1) { - // Luminance in rows, color values in columns. - const uint8_t* luma0 = - ((src_argb[0] * bc + src_argb[1] * gc + src_argb[2] * rc) & 0x7F00u) + - luma; - dst_argb[0] = luma0[src_argb[0]]; - dst_argb[1] = luma0[src_argb[1]]; - dst_argb[2] = luma0[src_argb[2]]; - dst_argb[3] = src_argb[3]; - } -} - -void ARGBCopyAlphaRow_C(const uint8_t* src, uint8_t* dst, int width) { - int i; - for (i = 0; i < width - 1; i += 2) { - dst[3] = src[3]; - dst[7] = src[7]; - dst += 8; - src += 8; - } - if (width & 1) { - dst[3] = src[3]; - } -} - -void ARGBExtractAlphaRow_C(const uint8_t* src_argb, uint8_t* dst_a, int width) { - int i; - for (i = 0; i < width - 1; i += 2) { - dst_a[0] = src_argb[3]; - dst_a[1] = src_argb[7]; - dst_a += 2; - src_argb += 8; - } - if (width & 1) { - dst_a[0] = src_argb[3]; - } -} - -void ARGBCopyYToAlphaRow_C(const uint8_t* src, uint8_t* dst, int width) { - int i; - for (i = 0; i < width - 1; i += 2) { - dst[3] = src[0]; - dst[7] = src[1]; - dst += 8; - src += 2; - } - if (width & 1) { - dst[3] = src[0]; - } -} - -// Maximum temporary width for wrappers to process at a time, in pixels. -#define MAXTWIDTH 2048 - -#if !(defined(_MSC_VER) && !defined(__clang__) && defined(_M_IX86)) && \ - defined(HAS_I422TORGB565ROW_SSSE3) -// row_win.cc has asm version, but GCC uses 2 step wrapper. -void I422ToRGB565Row_SSSE3(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* dst_rgb565, - const struct YuvConstants* yuvconstants, - int width) { - SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4]); - while (width > 0) { - int twidth = width > MAXTWIDTH ? 
MAXTWIDTH : width; - I422ToARGBRow_SSSE3(src_y, src_u, src_v, row, yuvconstants, twidth); - ARGBToRGB565Row_SSE2(row, dst_rgb565, twidth); - src_y += twidth; - src_u += twidth / 2; - src_v += twidth / 2; - dst_rgb565 += twidth * 2; - width -= twidth; - } -} -#endif - -#if defined(HAS_I422TOARGB1555ROW_SSSE3) -void I422ToARGB1555Row_SSSE3(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* dst_argb1555, - const struct YuvConstants* yuvconstants, - int width) { - // Row buffer for intermediate ARGB pixels. - SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4]); - while (width > 0) { - int twidth = width > MAXTWIDTH ? MAXTWIDTH : width; - I422ToARGBRow_SSSE3(src_y, src_u, src_v, row, yuvconstants, twidth); - ARGBToARGB1555Row_SSE2(row, dst_argb1555, twidth); - src_y += twidth; - src_u += twidth / 2; - src_v += twidth / 2; - dst_argb1555 += twidth * 2; - width -= twidth; - } -} -#endif - -#if defined(HAS_I422TOARGB4444ROW_SSSE3) -void I422ToARGB4444Row_SSSE3(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* dst_argb4444, - const struct YuvConstants* yuvconstants, - int width) { - // Row buffer for intermediate ARGB pixels. - SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4]); - while (width > 0) { - int twidth = width > MAXTWIDTH ? MAXTWIDTH : width; - I422ToARGBRow_SSSE3(src_y, src_u, src_v, row, yuvconstants, twidth); - ARGBToARGB4444Row_SSE2(row, dst_argb4444, twidth); - src_y += twidth; - src_u += twidth / 2; - src_v += twidth / 2; - dst_argb4444 += twidth * 2; - width -= twidth; - } -} -#endif - -#if defined(HAS_NV12TORGB565ROW_SSSE3) -void NV12ToRGB565Row_SSSE3(const uint8_t* src_y, - const uint8_t* src_uv, - uint8_t* dst_rgb565, - const struct YuvConstants* yuvconstants, - int width) { - // Row buffer for intermediate ARGB pixels. - SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4]); - while (width > 0) { - int twidth = width > MAXTWIDTH ? MAXTWIDTH : width; - NV12ToARGBRow_SSSE3(src_y, src_uv, row, yuvconstants, twidth); - ARGBToRGB565Row_SSE2(row, dst_rgb565, twidth); - src_y += twidth; - src_uv += twidth; - dst_rgb565 += twidth * 2; - width -= twidth; - } -} -#endif - -#if defined(HAS_NV12TORGB24ROW_SSSE3) -void NV12ToRGB24Row_SSSE3(const uint8_t* src_y, - const uint8_t* src_uv, - uint8_t* dst_rgb24, - const struct YuvConstants* yuvconstants, - int width) { - // Row buffer for intermediate ARGB pixels. - SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4]); - while (width > 0) { - int twidth = width > MAXTWIDTH ? MAXTWIDTH : width; - NV12ToARGBRow_SSSE3(src_y, src_uv, row, yuvconstants, twidth); - ARGBToRGB24Row_SSSE3(row, dst_rgb24, twidth); - src_y += twidth; - src_uv += twidth; - dst_rgb24 += twidth * 3; - width -= twidth; - } -} -#endif - -#if defined(HAS_NV21TORGB24ROW_SSSE3) -void NV21ToRGB24Row_SSSE3(const uint8_t* src_y, - const uint8_t* src_vu, - uint8_t* dst_rgb24, - const struct YuvConstants* yuvconstants, - int width) { - // Row buffer for intermediate ARGB pixels. - SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4]); - while (width > 0) { - int twidth = width > MAXTWIDTH ? MAXTWIDTH : width; - NV21ToARGBRow_SSSE3(src_y, src_vu, row, yuvconstants, twidth); - ARGBToRGB24Row_SSSE3(row, dst_rgb24, twidth); - src_y += twidth; - src_vu += twidth; - dst_rgb24 += twidth * 3; - width -= twidth; - } -} -#endif - -#if defined(HAS_NV12TORGB24ROW_AVX2) -void NV12ToRGB24Row_AVX2(const uint8_t* src_y, - const uint8_t* src_uv, - uint8_t* dst_rgb24, - const struct YuvConstants* yuvconstants, - int width) { - // Row buffer for intermediate ARGB pixels. 
- SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4]); - while (width > 0) { - int twidth = width > MAXTWIDTH ? MAXTWIDTH : width; - NV12ToARGBRow_AVX2(src_y, src_uv, row, yuvconstants, twidth); -#if defined(HAS_ARGBTORGB24ROW_AVX2) - ARGBToRGB24Row_AVX2(row, dst_rgb24, twidth); -#else - ARGBToRGB24Row_SSSE3(row, dst_rgb24, twidth); -#endif - src_y += twidth; - src_uv += twidth; - dst_rgb24 += twidth * 3; - width -= twidth; - } -} -#endif - -#if defined(HAS_NV21TORGB24ROW_AVX2) -void NV21ToRGB24Row_AVX2(const uint8_t* src_y, - const uint8_t* src_vu, - uint8_t* dst_rgb24, - const struct YuvConstants* yuvconstants, - int width) { - // Row buffer for intermediate ARGB pixels. - SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4]); - while (width > 0) { - int twidth = width > MAXTWIDTH ? MAXTWIDTH : width; - NV21ToARGBRow_AVX2(src_y, src_vu, row, yuvconstants, twidth); -#if defined(HAS_ARGBTORGB24ROW_AVX2) - ARGBToRGB24Row_AVX2(row, dst_rgb24, twidth); -#else - ARGBToRGB24Row_SSSE3(row, dst_rgb24, twidth); -#endif - src_y += twidth; - src_vu += twidth; - dst_rgb24 += twidth * 3; - width -= twidth; - } -} -#endif - -#if defined(HAS_I422TORGB565ROW_AVX2) -void I422ToRGB565Row_AVX2(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* dst_rgb565, - const struct YuvConstants* yuvconstants, - int width) { - SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4]); - while (width > 0) { - int twidth = width > MAXTWIDTH ? MAXTWIDTH : width; - I422ToARGBRow_AVX2(src_y, src_u, src_v, row, yuvconstants, twidth); -#if defined(HAS_ARGBTORGB565ROW_AVX2) - ARGBToRGB565Row_AVX2(row, dst_rgb565, twidth); -#else - ARGBToRGB565Row_SSE2(row, dst_rgb565, twidth); -#endif - src_y += twidth; - src_u += twidth / 2; - src_v += twidth / 2; - dst_rgb565 += twidth * 2; - width -= twidth; - } -} -#endif - -#if defined(HAS_I422TOARGB1555ROW_AVX2) -void I422ToARGB1555Row_AVX2(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* dst_argb1555, - const struct YuvConstants* yuvconstants, - int width) { - // Row buffer for intermediate ARGB pixels. - SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4]); - while (width > 0) { - int twidth = width > MAXTWIDTH ? MAXTWIDTH : width; - I422ToARGBRow_AVX2(src_y, src_u, src_v, row, yuvconstants, twidth); -#if defined(HAS_ARGBTOARGB1555ROW_AVX2) - ARGBToARGB1555Row_AVX2(row, dst_argb1555, twidth); -#else - ARGBToARGB1555Row_SSE2(row, dst_argb1555, twidth); -#endif - src_y += twidth; - src_u += twidth / 2; - src_v += twidth / 2; - dst_argb1555 += twidth * 2; - width -= twidth; - } -} -#endif - -#if defined(HAS_I422TOARGB4444ROW_AVX2) -void I422ToARGB4444Row_AVX2(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* dst_argb4444, - const struct YuvConstants* yuvconstants, - int width) { - // Row buffer for intermediate ARGB pixels. - SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4]); - while (width > 0) { - int twidth = width > MAXTWIDTH ? 
MAXTWIDTH : width; - I422ToARGBRow_AVX2(src_y, src_u, src_v, row, yuvconstants, twidth); -#if defined(HAS_ARGBTOARGB4444ROW_AVX2) - ARGBToARGB4444Row_AVX2(row, dst_argb4444, twidth); -#else - ARGBToARGB4444Row_SSE2(row, dst_argb4444, twidth); -#endif - src_y += twidth; - src_u += twidth / 2; - src_v += twidth / 2; - dst_argb4444 += twidth * 2; - width -= twidth; - } -} -#endif - -#if defined(HAS_I422TORGB24ROW_AVX2) -void I422ToRGB24Row_AVX2(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* dst_rgb24, - const struct YuvConstants* yuvconstants, - int width) { - // Row buffer for intermediate ARGB pixels. - SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4]); - while (width > 0) { - int twidth = width > MAXTWIDTH ? MAXTWIDTH : width; - I422ToARGBRow_AVX2(src_y, src_u, src_v, row, yuvconstants, twidth); -#if defined(HAS_ARGBTORGB24ROW_AVX2) - ARGBToRGB24Row_AVX2(row, dst_rgb24, twidth); -#else - ARGBToRGB24Row_SSSE3(row, dst_rgb24, twidth); -#endif - src_y += twidth; - src_u += twidth / 2; - src_v += twidth / 2; - dst_rgb24 += twidth * 3; - width -= twidth; - } -} -#endif - -#if defined(HAS_NV12TORGB565ROW_AVX2) -void NV12ToRGB565Row_AVX2(const uint8_t* src_y, - const uint8_t* src_uv, - uint8_t* dst_rgb565, - const struct YuvConstants* yuvconstants, - int width) { - // Row buffer for intermediate ARGB pixels. - SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4]); - while (width > 0) { - int twidth = width > MAXTWIDTH ? MAXTWIDTH : width; - NV12ToARGBRow_AVX2(src_y, src_uv, row, yuvconstants, twidth); -#if defined(HAS_ARGBTORGB565ROW_AVX2) - ARGBToRGB565Row_AVX2(row, dst_rgb565, twidth); -#else - ARGBToRGB565Row_SSE2(row, dst_rgb565, twidth); -#endif - src_y += twidth; - src_uv += twidth; - dst_rgb565 += twidth * 2; - width -= twidth; - } -} -#endif - -#ifdef HAS_RGB24TOYJROW_AVX2 -// Convert 16 RGB24 pixels (64 bytes) to 16 YJ values. -void RGB24ToYJRow_AVX2(const uint8_t* src_rgb24, uint8_t* dst_yj, int width) { - // Row buffer for intermediate ARGB pixels. - SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4]); - while (width > 0) { - int twidth = width > MAXTWIDTH ? MAXTWIDTH : width; - RGB24ToARGBRow_SSSE3(src_rgb24, row, twidth); - ARGBToYJRow_AVX2(row, dst_yj, twidth); - src_rgb24 += twidth * 3; - dst_yj += twidth; - width -= twidth; - } -} -#endif // HAS_RGB24TOYJROW_AVX2 - -#ifdef HAS_RAWTOYJROW_AVX2 -// Convert 16 RAW pixels (64 bytes) to 16 YJ values. -void RAWToYJRow_AVX2(const uint8_t* src_raw, uint8_t* dst_yj, int width) { - // Row buffer for intermediate ARGB pixels. - SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4]); - while (width > 0) { - int twidth = width > MAXTWIDTH ? MAXTWIDTH : width; - RAWToARGBRow_SSSE3(src_raw, row, twidth); - ARGBToYJRow_AVX2(row, dst_yj, twidth); - src_raw += twidth * 3; - dst_yj += twidth; - width -= twidth; - } -} -#endif // HAS_RAWTOYJROW_AVX2 - -#ifdef HAS_RGB24TOYJROW_SSSE3 -// Convert 16 RGB24 pixels (64 bytes) to 16 YJ values. -void RGB24ToYJRow_SSSE3(const uint8_t* src_rgb24, uint8_t* dst_yj, int width) { - // Row buffer for intermediate ARGB pixels. - SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4]); - while (width > 0) { - int twidth = width > MAXTWIDTH ? MAXTWIDTH : width; - RGB24ToARGBRow_SSSE3(src_rgb24, row, twidth); - ARGBToYJRow_SSSE3(row, dst_yj, twidth); - src_rgb24 += twidth * 3; - dst_yj += twidth; - width -= twidth; - } -} -#endif // HAS_RGB24TOYJROW_SSSE3 - -#ifdef HAS_RAWTOYJROW_SSSE3 -// Convert 16 RAW pixels (64 bytes) to 16 YJ values. 
-void RAWToYJRow_SSSE3(const uint8_t* src_raw, uint8_t* dst_yj, int width) { - // Row buffer for intermediate ARGB pixels. - SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4]); - while (width > 0) { - int twidth = width > MAXTWIDTH ? MAXTWIDTH : width; - RAWToARGBRow_SSSE3(src_raw, row, twidth); - ARGBToYJRow_SSSE3(row, dst_yj, twidth); - src_raw += twidth * 3; - dst_yj += twidth; - width -= twidth; - } -} -#endif // HAS_RAWTOYJROW_SSSE3 - -float ScaleSumSamples_C(const float* src, float* dst, float scale, int width) { - float fsum = 0.f; - int i; - for (i = 0; i < width; ++i) { - float v = *src++; - fsum += v * v; - *dst++ = v * scale; - } - return fsum; -} - -float ScaleMaxSamples_C(const float* src, float* dst, float scale, int width) { - float fmax = 0.f; - int i; - for (i = 0; i < width; ++i) { - float v = *src++; - float vs = v * scale; - fmax = (v > fmax) ? v : fmax; - *dst++ = vs; - } - return fmax; -} - -void ScaleSamples_C(const float* src, float* dst, float scale, int width) { - int i; - for (i = 0; i < width; ++i) { - *dst++ = *src++ * scale; - } -} - -void GaussRow_C(const uint32_t* src, uint16_t* dst, int width) { - int i; - for (i = 0; i < width; ++i) { - *dst++ = - (src[0] + src[1] * 4 + src[2] * 6 + src[3] * 4 + src[4] + 128) >> 8; - ++src; - } -} - -// filter 5 rows with 1, 4, 6, 4, 1 coefficients to produce 1 row. -void GaussCol_C(const uint16_t* src0, - const uint16_t* src1, - const uint16_t* src2, - const uint16_t* src3, - const uint16_t* src4, - uint32_t* dst, - int width) { - int i; - for (i = 0; i < width; ++i) { - *dst++ = *src0++ + *src1++ * 4 + *src2++ * 6 + *src3++ * 4 + *src4++; - } -} - -void GaussRow_F32_C(const float* src, float* dst, int width) { - int i; - for (i = 0; i < width; ++i) { - *dst++ = (src[0] + src[1] * 4 + src[2] * 6 + src[3] * 4 + src[4]) * - (1.0f / 256.0f); - ++src; - } -} - -// filter 5 rows with 1, 4, 6, 4, 1 coefficients to produce 1 row. -void GaussCol_F32_C(const float* src0, - const float* src1, - const float* src2, - const float* src3, - const float* src4, - float* dst, - int width) { - int i; - for (i = 0; i < width; ++i) { - *dst++ = *src0++ + *src1++ * 4 + *src2++ * 6 + *src3++ * 4 + *src4++; - } -} - -// Convert biplanar NV21 to packed YUV24 -void NV21ToYUV24Row_C(const uint8_t* src_y, - const uint8_t* src_vu, - uint8_t* dst_yuv24, - int width) { - int x; - for (x = 0; x < width - 1; x += 2) { - dst_yuv24[0] = src_vu[0]; // V - dst_yuv24[1] = src_vu[1]; // U - dst_yuv24[2] = src_y[0]; // Y0 - dst_yuv24[3] = src_vu[0]; // V - dst_yuv24[4] = src_vu[1]; // U - dst_yuv24[5] = src_y[1]; // Y1 - src_y += 2; - src_vu += 2; - dst_yuv24 += 6; // Advance 2 pixels. - } - if (width & 1) { - dst_yuv24[0] = src_vu[0]; // V - dst_yuv24[1] = src_vu[1]; // U - dst_yuv24[2] = src_y[0]; // Y0 - } -} - -// Filter 2 rows of AYUV UV's (444) into UV (420). -// AYUV is VUYA in memory. UV for NV12 is UV order in memory. -void AYUVToUVRow_C(const uint8_t* src_ayuv, - int src_stride_ayuv, - uint8_t* dst_uv, - int width) { - // Output a row of UV values, filtering 2x2 rows of AYUV. 
- int x; - for (x = 0; x < width - 1; x += 2) { - dst_uv[0] = (src_ayuv[1] + src_ayuv[5] + src_ayuv[src_stride_ayuv + 1] + - src_ayuv[src_stride_ayuv + 5] + 2) >> - 2; - dst_uv[1] = (src_ayuv[0] + src_ayuv[4] + src_ayuv[src_stride_ayuv + 0] + - src_ayuv[src_stride_ayuv + 4] + 2) >> - 2; - src_ayuv += 8; - dst_uv += 2; - } - if (width & 1) { - dst_uv[0] = (src_ayuv[1] + src_ayuv[src_stride_ayuv + 1] + 1) >> 1; - dst_uv[1] = (src_ayuv[0] + src_ayuv[src_stride_ayuv + 0] + 1) >> 1; - } -} - -// Filter 2 rows of AYUV UV's (444) into VU (420). -void AYUVToVURow_C(const uint8_t* src_ayuv, - int src_stride_ayuv, - uint8_t* dst_vu, - int width) { - // Output a row of VU values, filtering 2x2 rows of AYUV. - int x; - for (x = 0; x < width - 1; x += 2) { - dst_vu[0] = (src_ayuv[0] + src_ayuv[4] + src_ayuv[src_stride_ayuv + 0] + - src_ayuv[src_stride_ayuv + 4] + 2) >> - 2; - dst_vu[1] = (src_ayuv[1] + src_ayuv[5] + src_ayuv[src_stride_ayuv + 1] + - src_ayuv[src_stride_ayuv + 5] + 2) >> - 2; - src_ayuv += 8; - dst_vu += 2; - } - if (width & 1) { - dst_vu[0] = (src_ayuv[0] + src_ayuv[src_stride_ayuv + 0] + 1) >> 1; - dst_vu[1] = (src_ayuv[1] + src_ayuv[src_stride_ayuv + 1] + 1) >> 1; - } -} - -// Copy row of AYUV Y's into Y -void AYUVToYRow_C(const uint8_t* src_ayuv, uint8_t* dst_y, int width) { - // Output a row of Y values. - int x; - for (x = 0; x < width; ++x) { - dst_y[x] = src_ayuv[2]; // v,u,y,a - src_ayuv += 4; - } -} - -// Convert UV plane of NV12 to VU of NV21. -void SwapUVRow_C(const uint8_t* src_uv, uint8_t* dst_vu, int width) { - int x; - for (x = 0; x < width; ++x) { - uint8_t u = src_uv[0]; - uint8_t v = src_uv[1]; - dst_vu[0] = v; - dst_vu[1] = u; - src_uv += 2; - dst_vu += 2; - } -} - -void HalfMergeUVRow_C(const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_uv, - int width) { - int x; - for (x = 0; x < width - 1; x += 2) { - dst_uv[0] = (src_u[0] + src_u[1] + src_u[src_stride_u] + - src_u[src_stride_u + 1] + 2) >> - 2; - dst_uv[1] = (src_v[0] + src_v[1] + src_v[src_stride_v] + - src_v[src_stride_v + 1] + 2) >> - 2; - src_u += 2; - src_v += 2; - dst_uv += 2; - } - if (width & 1) { - dst_uv[0] = (src_u[0] + src_u[src_stride_u] + 1) >> 1; - dst_uv[1] = (src_v[0] + src_v[src_stride_v] + 1) >> 1; - } -} - -#ifdef __cplusplus -} // extern "C" -} // namespace libyuv -#endif diff --git a/thirdparty/libyuv/source/row_gcc.cc b/thirdparty/libyuv/source/row_gcc.cc deleted file mode 100644 index 43e4c71..0000000 --- a/thirdparty/libyuv/source/row_gcc.cc +++ /dev/null @@ -1,9195 +0,0 @@ -/* - * Copyright 2011 The LibYuv Project Authors. All rights reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#include "libyuv/row.h" - -#ifdef __cplusplus -namespace libyuv { -extern "C" { -#endif - -// This module is for GCC x86 and x64. -#if !defined(LIBYUV_DISABLE_X86) && \ - (defined(__x86_64__) || defined(__i386__)) - -#if defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_ARGBGRAYROW_SSSE3) - -// Constants for ARGB -static const uvec8 kARGBToY = {25u, 129u, 66u, 0u, 25u, 129u, 66u, 0u, - 25u, 129u, 66u, 0u, 25u, 129u, 66u, 0u}; - -// JPeg full range. 
-static const uvec8 kARGBToYJ = {29u, 150u, 77u, 0u, 29u, 150u, 77u, 0u, - 29u, 150u, 77u, 0u, 29u, 150u, 77u, 0u}; - -static const uvec8 kRGBAToYJ = {0u, 29u, 150u, 77u, 0u, 29u, 150u, 77u, - 0u, 29u, 150u, 77u, 0u, 29u, 150u, 77u}; -#endif // defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_ARGBGRAYROW_SSSE3) - -#if defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_I422TOARGBROW_SSSE3) - -static const vec8 kARGBToU = {112, -74, -38, 0, 112, -74, -38, 0, - 112, -74, -38, 0, 112, -74, -38, 0}; - -static const vec8 kARGBToUJ = {127, -84, -43, 0, 127, -84, -43, 0, - 127, -84, -43, 0, 127, -84, -43, 0}; - -static const vec8 kARGBToV = {-18, -94, 112, 0, -18, -94, 112, 0, - -18, -94, 112, 0, -18, -94, 112, 0}; - -static const vec8 kARGBToVJ = {-20, -107, 127, 0, -20, -107, 127, 0, - -20, -107, 127, 0, -20, -107, 127, 0}; - -// Constants for BGRA -static const uvec8 kBGRAToY = {0u, 66u, 129u, 25u, 0u, 66u, 129u, 25u, - 0u, 66u, 129u, 25u, 0u, 66u, 129u, 25u}; - -static const vec8 kBGRAToU = {0, -38, -74, 112, 0, -38, -74, 112, - 0, -38, -74, 112, 0, -38, -74, 112}; - -static const vec8 kBGRAToV = {0, 112, -94, -18, 0, 112, -94, -18, - 0, 112, -94, -18, 0, 112, -94, -18}; - -// Constants for ABGR -static const uvec8 kABGRToY = {66u, 129u, 25u, 0u, 66u, 129u, 25u, 0u, - 66u, 129u, 25u, 0u, 66u, 129u, 25u, 0u}; - -static const vec8 kABGRToU = {-38, -74, 112, 0, -38, -74, 112, 0, - -38, -74, 112, 0, -38, -74, 112, 0}; - -static const vec8 kABGRToV = {112, -94, -18, 0, 112, -94, -18, 0, - 112, -94, -18, 0, 112, -94, -18, 0}; - -// Constants for RGBA. -static const uvec8 kRGBAToY = {0u, 25u, 129u, 66u, 0u, 25u, 129u, 66u, - 0u, 25u, 129u, 66u, 0u, 25u, 129u, 66u}; - -static const vec8 kRGBAToU = {0, 112, -74, -38, 0, 112, -74, -38, - 0, 112, -74, -38, 0, 112, -74, -38}; - -static const vec8 kRGBAToV = {0, -18, -94, 112, 0, -18, -94, 112, - 0, -18, -94, 112, 0, -18, -94, 112}; - -static const uvec16 kAddY16 = {0x7e80u, 0x7e80u, 0x7e80u, 0x7e80u, - 0x7e80u, 0x7e80u, 0x7e80u, 0x7e80u}; - -static const uvec8 kAddUV128 = {128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, - 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u}; - -static const uvec16 kSub128 = {0x8080u, 0x8080u, 0x8080u, 0x8080u, - 0x8080u, 0x8080u, 0x8080u, 0x8080u}; - -#endif // defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_I422TOARGBROW_SSSE3) - -#ifdef HAS_RGB24TOARGBROW_SSSE3 - -// Shuffle table for converting RGB24 to ARGB. -static const uvec8 kShuffleMaskRGB24ToARGB = { - 0u, 1u, 2u, 12u, 3u, 4u, 5u, 13u, 6u, 7u, 8u, 14u, 9u, 10u, 11u, 15u}; - -// Shuffle table for converting RAW to ARGB. -static const uvec8 kShuffleMaskRAWToARGB = {2u, 1u, 0u, 12u, 5u, 4u, 3u, 13u, - 8u, 7u, 6u, 14u, 11u, 10u, 9u, 15u}; - -// Shuffle table for converting RAW to RGBA. -static const uvec8 kShuffleMaskRAWToRGBA = {12u, 2u, 1u, 0u, 13u, 5u, 4u, 3u, - 14u, 8u, 7u, 6u, 15u, 11u, 10u, 9u}; - -// Shuffle table for converting RAW to RGB24. First 8. -static const uvec8 kShuffleMaskRAWToRGB24_0 = { - 2u, 1u, 0u, 5u, 4u, 3u, 8u, 7u, - 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u}; - -// Shuffle table for converting RAW to RGB24. Middle 8. -static const uvec8 kShuffleMaskRAWToRGB24_1 = { - 2u, 7u, 6u, 5u, 10u, 9u, 8u, 13u, - 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u}; - -// Shuffle table for converting RAW to RGB24. Last 8. -static const uvec8 kShuffleMaskRAWToRGB24_2 = { - 8u, 7u, 12u, 11u, 10u, 15u, 14u, 13u, - 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u}; - -// Shuffle table for converting ARGB to RGB24. 
-static const uvec8 kShuffleMaskARGBToRGB24 = { - 0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 10u, 12u, 13u, 14u, 128u, 128u, 128u, 128u}; - -// Shuffle table for converting ARGB to RAW. -static const uvec8 kShuffleMaskARGBToRAW = { - 2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 8u, 14u, 13u, 12u, 128u, 128u, 128u, 128u}; - -// Shuffle table for converting ARGBToRGB24 for I422ToRGB24. First 8 + next 4 -static const uvec8 kShuffleMaskARGBToRGB24_0 = { - 0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 128u, 128u, 128u, 128u, 10u, 12u, 13u, 14u}; - -// YUY2 shuf 16 Y to 32 Y. -static const lvec8 kShuffleYUY2Y = {0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, - 10, 12, 12, 14, 14, 0, 0, 2, 2, 4, 4, - 6, 6, 8, 8, 10, 10, 12, 12, 14, 14}; - -// YUY2 shuf 8 UV to 16 UV. -static const lvec8 kShuffleYUY2UV = {1, 3, 1, 3, 5, 7, 5, 7, 9, 11, 9, - 11, 13, 15, 13, 15, 1, 3, 1, 3, 5, 7, - 5, 7, 9, 11, 9, 11, 13, 15, 13, 15}; - -// UYVY shuf 16 Y to 32 Y. -static const lvec8 kShuffleUYVYY = {1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, - 11, 13, 13, 15, 15, 1, 1, 3, 3, 5, 5, - 7, 7, 9, 9, 11, 11, 13, 13, 15, 15}; - -// UYVY shuf 8 UV to 16 UV. -static const lvec8 kShuffleUYVYUV = {0, 2, 0, 2, 4, 6, 4, 6, 8, 10, 8, - 10, 12, 14, 12, 14, 0, 2, 0, 2, 4, 6, - 4, 6, 8, 10, 8, 10, 12, 14, 12, 14}; - -// NV21 shuf 8 VU to 16 UV. -static const lvec8 kShuffleNV21 = { - 1, 0, 1, 0, 3, 2, 3, 2, 5, 4, 5, 4, 7, 6, 7, 6, - 1, 0, 1, 0, 3, 2, 3, 2, 5, 4, 5, 4, 7, 6, 7, 6, -}; -#endif // HAS_RGB24TOARGBROW_SSSE3 - -#ifdef HAS_J400TOARGBROW_SSE2 -void J400ToARGBRow_SSE2(const uint8_t* src_y, uint8_t* dst_argb, int width) { - asm volatile( - "pcmpeqb %%xmm5,%%xmm5 \n" - "pslld $0x18,%%xmm5 \n" - - LABELALIGN - "1: \n" - "movq (%0),%%xmm0 \n" - "lea 0x8(%0),%0 \n" - "punpcklbw %%xmm0,%%xmm0 \n" - "movdqa %%xmm0,%%xmm1 \n" - "punpcklwd %%xmm0,%%xmm0 \n" - "punpckhwd %%xmm1,%%xmm1 \n" - "por %%xmm5,%%xmm0 \n" - "por %%xmm5,%%xmm1 \n" - "movdqu %%xmm0,(%1) \n" - "movdqu %%xmm1,0x10(%1) \n" - "lea 0x20(%1),%1 \n" - "sub $0x8,%2 \n" - "jg 1b \n" - : "+r"(src_y), // %0 - "+r"(dst_argb), // %1 - "+r"(width) // %2 - ::"memory", - "cc", "xmm0", "xmm1", "xmm5"); -} -#endif // HAS_J400TOARGBROW_SSE2 - -#ifdef HAS_RGB24TOARGBROW_SSSE3 -void RGB24ToARGBRow_SSSE3(const uint8_t* src_rgb24, - uint8_t* dst_argb, - int width) { - asm volatile( - "pcmpeqb %%xmm5,%%xmm5 \n" // 0xff000000 - "pslld $0x18,%%xmm5 \n" - "movdqa %3,%%xmm4 \n" - - LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "movdqu 0x20(%0),%%xmm3 \n" - "lea 0x30(%0),%0 \n" - "movdqa %%xmm3,%%xmm2 \n" - "palignr $0x8,%%xmm1,%%xmm2 \n" - "pshufb %%xmm4,%%xmm2 \n" - "por %%xmm5,%%xmm2 \n" - "palignr $0xc,%%xmm0,%%xmm1 \n" - "pshufb %%xmm4,%%xmm0 \n" - "movdqu %%xmm2,0x20(%1) \n" - "por %%xmm5,%%xmm0 \n" - "pshufb %%xmm4,%%xmm1 \n" - "movdqu %%xmm0,(%1) \n" - "por %%xmm5,%%xmm1 \n" - "palignr $0x4,%%xmm3,%%xmm3 \n" - "pshufb %%xmm4,%%xmm3 \n" - "movdqu %%xmm1,0x10(%1) \n" - "por %%xmm5,%%xmm3 \n" - "movdqu %%xmm3,0x30(%1) \n" - "lea 0x40(%1),%1 \n" - "sub $0x10,%2 \n" - "jg 1b \n" - : "+r"(src_rgb24), // %0 - "+r"(dst_argb), // %1 - "+r"(width) // %2 - : "m"(kShuffleMaskRGB24ToARGB) // %3 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); -} - -void RAWToARGBRow_SSSE3(const uint8_t* src_raw, uint8_t* dst_argb, int width) { - asm volatile( - "pcmpeqb %%xmm5,%%xmm5 \n" // 0xff000000 - "pslld $0x18,%%xmm5 \n" - "movdqa %3,%%xmm4 \n" - - LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "movdqu 0x20(%0),%%xmm3 \n" - "lea 0x30(%0),%0 \n" - "movdqa %%xmm3,%%xmm2 \n" - 
"palignr $0x8,%%xmm1,%%xmm2 \n" - "pshufb %%xmm4,%%xmm2 \n" - "por %%xmm5,%%xmm2 \n" - "palignr $0xc,%%xmm0,%%xmm1 \n" - "pshufb %%xmm4,%%xmm0 \n" - "movdqu %%xmm2,0x20(%1) \n" - "por %%xmm5,%%xmm0 \n" - "pshufb %%xmm4,%%xmm1 \n" - "movdqu %%xmm0,(%1) \n" - "por %%xmm5,%%xmm1 \n" - "palignr $0x4,%%xmm3,%%xmm3 \n" - "pshufb %%xmm4,%%xmm3 \n" - "movdqu %%xmm1,0x10(%1) \n" - "por %%xmm5,%%xmm3 \n" - "movdqu %%xmm3,0x30(%1) \n" - "lea 0x40(%1),%1 \n" - "sub $0x10,%2 \n" - "jg 1b \n" - : "+r"(src_raw), // %0 - "+r"(dst_argb), // %1 - "+r"(width) // %2 - : "m"(kShuffleMaskRAWToARGB) // %3 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); -} - -// Same code as RAWToARGB with different shuffler and A in low bits -void RAWToRGBARow_SSSE3(const uint8_t* src_raw, uint8_t* dst_rgba, int width) { - asm volatile( - "pcmpeqb %%xmm5,%%xmm5 \n" // 0x000000ff - "psrld $0x18,%%xmm5 \n" - "movdqa %3,%%xmm4 \n" - - LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "movdqu 0x20(%0),%%xmm3 \n" - "lea 0x30(%0),%0 \n" - "movdqa %%xmm3,%%xmm2 \n" - "palignr $0x8,%%xmm1,%%xmm2 \n" - "pshufb %%xmm4,%%xmm2 \n" - "por %%xmm5,%%xmm2 \n" - "palignr $0xc,%%xmm0,%%xmm1 \n" - "pshufb %%xmm4,%%xmm0 \n" - "movdqu %%xmm2,0x20(%1) \n" - "por %%xmm5,%%xmm0 \n" - "pshufb %%xmm4,%%xmm1 \n" - "movdqu %%xmm0,(%1) \n" - "por %%xmm5,%%xmm1 \n" - "palignr $0x4,%%xmm3,%%xmm3 \n" - "pshufb %%xmm4,%%xmm3 \n" - "movdqu %%xmm1,0x10(%1) \n" - "por %%xmm5,%%xmm3 \n" - "movdqu %%xmm3,0x30(%1) \n" - "lea 0x40(%1),%1 \n" - "sub $0x10,%2 \n" - "jg 1b \n" - : "+r"(src_raw), // %0 - "+r"(dst_rgba), // %1 - "+r"(width) // %2 - : "m"(kShuffleMaskRAWToRGBA) // %3 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); -} - -void RAWToRGB24Row_SSSE3(const uint8_t* src_raw, - uint8_t* dst_rgb24, - int width) { - asm volatile( - "movdqa %3,%%xmm3 \n" - "movdqa %4,%%xmm4 \n" - "movdqa %5,%%xmm5 \n" - - LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x4(%0),%%xmm1 \n" - "movdqu 0x8(%0),%%xmm2 \n" - "lea 0x18(%0),%0 \n" - "pshufb %%xmm3,%%xmm0 \n" - "pshufb %%xmm4,%%xmm1 \n" - "pshufb %%xmm5,%%xmm2 \n" - "movq %%xmm0,(%1) \n" - "movq %%xmm1,0x8(%1) \n" - "movq %%xmm2,0x10(%1) \n" - "lea 0x18(%1),%1 \n" - "sub $0x8,%2 \n" - "jg 1b \n" - : "+r"(src_raw), // %0 - "+r"(dst_rgb24), // %1 - "+r"(width) // %2 - : "m"(kShuffleMaskRAWToRGB24_0), // %3 - "m"(kShuffleMaskRAWToRGB24_1), // %4 - "m"(kShuffleMaskRAWToRGB24_2) // %5 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); -} - -void RGB565ToARGBRow_SSE2(const uint8_t* src, uint8_t* dst, int width) { - asm volatile( - "mov $0x1080108,%%eax \n" - "movd %%eax,%%xmm5 \n" - "pshufd $0x0,%%xmm5,%%xmm5 \n" - "mov $0x20802080,%%eax \n" - "movd %%eax,%%xmm6 \n" - "pshufd $0x0,%%xmm6,%%xmm6 \n" - "pcmpeqb %%xmm3,%%xmm3 \n" - "psllw $0xb,%%xmm3 \n" - "pcmpeqb %%xmm4,%%xmm4 \n" - "psllw $0xa,%%xmm4 \n" - "psrlw $0x5,%%xmm4 \n" - "pcmpeqb %%xmm7,%%xmm7 \n" - "psllw $0x8,%%xmm7 \n" - "sub %0,%1 \n" - "sub %0,%1 \n" - - LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqa %%xmm0,%%xmm1 \n" - "movdqa %%xmm0,%%xmm2 \n" - "pand %%xmm3,%%xmm1 \n" - "psllw $0xb,%%xmm2 \n" - "pmulhuw %%xmm5,%%xmm1 \n" - "pmulhuw %%xmm5,%%xmm2 \n" - "psllw $0x8,%%xmm1 \n" - "por %%xmm2,%%xmm1 \n" - "pand %%xmm4,%%xmm0 \n" - "pmulhuw %%xmm6,%%xmm0 \n" - "por %%xmm7,%%xmm0 \n" - "movdqa %%xmm1,%%xmm2 \n" - "punpcklbw %%xmm0,%%xmm1 \n" - "punpckhbw %%xmm0,%%xmm2 \n" - "movdqu %%xmm1,0x00(%1,%0,2) \n" - "movdqu %%xmm2,0x10(%1,%0,2) \n" - "lea 0x10(%0),%0 \n" - 
"sub $0x8,%2 \n" - "jg 1b \n" - : "+r"(src), // %0 - "+r"(dst), // %1 - "+r"(width) // %2 - : - : "memory", "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", - "xmm6", "xmm7"); -} - -void ARGB1555ToARGBRow_SSE2(const uint8_t* src, uint8_t* dst, int width) { - asm volatile( - "mov $0x1080108,%%eax \n" - "movd %%eax,%%xmm5 \n" - "pshufd $0x0,%%xmm5,%%xmm5 \n" - "mov $0x42004200,%%eax \n" - "movd %%eax,%%xmm6 \n" - "pshufd $0x0,%%xmm6,%%xmm6 \n" - "pcmpeqb %%xmm3,%%xmm3 \n" - "psllw $0xb,%%xmm3 \n" - "movdqa %%xmm3,%%xmm4 \n" - "psrlw $0x6,%%xmm4 \n" - "pcmpeqb %%xmm7,%%xmm7 \n" - "psllw $0x8,%%xmm7 \n" - "sub %0,%1 \n" - "sub %0,%1 \n" - - LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqa %%xmm0,%%xmm1 \n" - "movdqa %%xmm0,%%xmm2 \n" - "psllw $0x1,%%xmm1 \n" - "psllw $0xb,%%xmm2 \n" - "pand %%xmm3,%%xmm1 \n" - "pmulhuw %%xmm5,%%xmm2 \n" - "pmulhuw %%xmm5,%%xmm1 \n" - "psllw $0x8,%%xmm1 \n" - "por %%xmm2,%%xmm1 \n" - "movdqa %%xmm0,%%xmm2 \n" - "pand %%xmm4,%%xmm0 \n" - "psraw $0x8,%%xmm2 \n" - "pmulhuw %%xmm6,%%xmm0 \n" - "pand %%xmm7,%%xmm2 \n" - "por %%xmm2,%%xmm0 \n" - "movdqa %%xmm1,%%xmm2 \n" - "punpcklbw %%xmm0,%%xmm1 \n" - "punpckhbw %%xmm0,%%xmm2 \n" - "movdqu %%xmm1,0x00(%1,%0,2) \n" - "movdqu %%xmm2,0x10(%1,%0,2) \n" - "lea 0x10(%0),%0 \n" - "sub $0x8,%2 \n" - "jg 1b \n" - : "+r"(src), // %0 - "+r"(dst), // %1 - "+r"(width) // %2 - : - : "memory", "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", - "xmm6", "xmm7"); -} - -void ARGB4444ToARGBRow_SSE2(const uint8_t* src, uint8_t* dst, int width) { - asm volatile( - "mov $0xf0f0f0f,%%eax \n" - "movd %%eax,%%xmm4 \n" - "pshufd $0x0,%%xmm4,%%xmm4 \n" - "movdqa %%xmm4,%%xmm5 \n" - "pslld $0x4,%%xmm5 \n" - "sub %0,%1 \n" - "sub %0,%1 \n" - - LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqa %%xmm0,%%xmm2 \n" - "pand %%xmm4,%%xmm0 \n" - "pand %%xmm5,%%xmm2 \n" - "movdqa %%xmm0,%%xmm1 \n" - "movdqa %%xmm2,%%xmm3 \n" - "psllw $0x4,%%xmm1 \n" - "psrlw $0x4,%%xmm3 \n" - "por %%xmm1,%%xmm0 \n" - "por %%xmm3,%%xmm2 \n" - "movdqa %%xmm0,%%xmm1 \n" - "punpcklbw %%xmm2,%%xmm0 \n" - "punpckhbw %%xmm2,%%xmm1 \n" - "movdqu %%xmm0,0x00(%1,%0,2) \n" - "movdqu %%xmm1,0x10(%1,%0,2) \n" - "lea 0x10(%0),%0 \n" - "sub $0x8,%2 \n" - "jg 1b \n" - : "+r"(src), // %0 - "+r"(dst), // %1 - "+r"(width) // %2 - : - : "memory", "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); -} - -void ARGBToRGB24Row_SSSE3(const uint8_t* src, uint8_t* dst, int width) { - asm volatile( - - "movdqa %3,%%xmm6 \n" - - LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "movdqu 0x20(%0),%%xmm2 \n" - "movdqu 0x30(%0),%%xmm3 \n" - "lea 0x40(%0),%0 \n" - "pshufb %%xmm6,%%xmm0 \n" - "pshufb %%xmm6,%%xmm1 \n" - "pshufb %%xmm6,%%xmm2 \n" - "pshufb %%xmm6,%%xmm3 \n" - "movdqa %%xmm1,%%xmm4 \n" - "psrldq $0x4,%%xmm1 \n" - "pslldq $0xc,%%xmm4 \n" - "movdqa %%xmm2,%%xmm5 \n" - "por %%xmm4,%%xmm0 \n" - "pslldq $0x8,%%xmm5 \n" - "movdqu %%xmm0,(%1) \n" - "por %%xmm5,%%xmm1 \n" - "psrldq $0x8,%%xmm2 \n" - "pslldq $0x4,%%xmm3 \n" - "por %%xmm3,%%xmm2 \n" - "movdqu %%xmm1,0x10(%1) \n" - "movdqu %%xmm2,0x20(%1) \n" - "lea 0x30(%1),%1 \n" - "sub $0x10,%2 \n" - "jg 1b \n" - : "+r"(src), // %0 - "+r"(dst), // %1 - "+r"(width) // %2 - : "m"(kShuffleMaskARGBToRGB24) // %3 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"); -} - -void ARGBToRAWRow_SSSE3(const uint8_t* src, uint8_t* dst, int width) { - asm volatile( - - "movdqa %3,%%xmm6 \n" - - LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 
0x10(%0),%%xmm1 \n" - "movdqu 0x20(%0),%%xmm2 \n" - "movdqu 0x30(%0),%%xmm3 \n" - "lea 0x40(%0),%0 \n" - "pshufb %%xmm6,%%xmm0 \n" - "pshufb %%xmm6,%%xmm1 \n" - "pshufb %%xmm6,%%xmm2 \n" - "pshufb %%xmm6,%%xmm3 \n" - "movdqa %%xmm1,%%xmm4 \n" - "psrldq $0x4,%%xmm1 \n" - "pslldq $0xc,%%xmm4 \n" - "movdqa %%xmm2,%%xmm5 \n" - "por %%xmm4,%%xmm0 \n" - "pslldq $0x8,%%xmm5 \n" - "movdqu %%xmm0,(%1) \n" - "por %%xmm5,%%xmm1 \n" - "psrldq $0x8,%%xmm2 \n" - "pslldq $0x4,%%xmm3 \n" - "por %%xmm3,%%xmm2 \n" - "movdqu %%xmm1,0x10(%1) \n" - "movdqu %%xmm2,0x20(%1) \n" - "lea 0x30(%1),%1 \n" - "sub $0x10,%2 \n" - "jg 1b \n" - : "+r"(src), // %0 - "+r"(dst), // %1 - "+r"(width) // %2 - : "m"(kShuffleMaskARGBToRAW) // %3 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"); -} - -#ifdef HAS_ARGBTORGB24ROW_AVX2 -// vpermd for 12+12 to 24 -static const lvec32 kPermdRGB24_AVX = {0, 1, 2, 4, 5, 6, 3, 7}; - -void ARGBToRGB24Row_AVX2(const uint8_t* src, uint8_t* dst, int width) { - asm volatile( - "vbroadcastf128 %3,%%ymm6 \n" - "vmovdqa %4,%%ymm7 \n" - - LABELALIGN - "1: \n" - "vmovdqu (%0),%%ymm0 \n" - "vmovdqu 0x20(%0),%%ymm1 \n" - "vmovdqu 0x40(%0),%%ymm2 \n" - "vmovdqu 0x60(%0),%%ymm3 \n" - "lea 0x80(%0),%0 \n" - "vpshufb %%ymm6,%%ymm0,%%ymm0 \n" // xxx0yyy0 - "vpshufb %%ymm6,%%ymm1,%%ymm1 \n" - "vpshufb %%ymm6,%%ymm2,%%ymm2 \n" - "vpshufb %%ymm6,%%ymm3,%%ymm3 \n" - "vpermd %%ymm0,%%ymm7,%%ymm0 \n" // pack to 24 bytes - "vpermd %%ymm1,%%ymm7,%%ymm1 \n" - "vpermd %%ymm2,%%ymm7,%%ymm2 \n" - "vpermd %%ymm3,%%ymm7,%%ymm3 \n" - "vpermq $0x3f,%%ymm1,%%ymm4 \n" // combine 24 + 8 - "vpor %%ymm4,%%ymm0,%%ymm0 \n" - "vmovdqu %%ymm0,(%1) \n" - "vpermq $0xf9,%%ymm1,%%ymm1 \n" // combine 16 + 16 - "vpermq $0x4f,%%ymm2,%%ymm4 \n" - "vpor %%ymm4,%%ymm1,%%ymm1 \n" - "vmovdqu %%ymm1,0x20(%1) \n" - "vpermq $0xfe,%%ymm2,%%ymm2 \n" // combine 8 + 24 - "vpermq $0x93,%%ymm3,%%ymm3 \n" - "vpor %%ymm3,%%ymm2,%%ymm2 \n" - "vmovdqu %%ymm2,0x40(%1) \n" - "lea 0x60(%1),%1 \n" - "sub $0x20,%2 \n" - "jg 1b \n" - "vzeroupper \n" - : "+r"(src), // %0 - "+r"(dst), // %1 - "+r"(width) // %2 - : "m"(kShuffleMaskARGBToRGB24), // %3 - "m"(kPermdRGB24_AVX) // %4 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", - "xmm7"); -} -#endif - -#ifdef HAS_ARGBTORGB24ROW_AVX512VBMI -// Shuffle table for converting ARGBToRGB24 -static const ulvec8 kPermARGBToRGB24_0 = { - 0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 10u, 12u, 13u, - 14u, 16u, 17u, 18u, 20u, 21u, 22u, 24u, 25u, 26u, 28u, - 29u, 30u, 32u, 33u, 34u, 36u, 37u, 38u, 40u, 41u}; -static const ulvec8 kPermARGBToRGB24_1 = { - 10u, 12u, 13u, 14u, 16u, 17u, 18u, 20u, 21u, 22u, 24u, - 25u, 26u, 28u, 29u, 30u, 32u, 33u, 34u, 36u, 37u, 38u, - 40u, 41u, 42u, 44u, 45u, 46u, 48u, 49u, 50u, 52u}; -static const ulvec8 kPermARGBToRGB24_2 = { - 21u, 22u, 24u, 25u, 26u, 28u, 29u, 30u, 32u, 33u, 34u, - 36u, 37u, 38u, 40u, 41u, 42u, 44u, 45u, 46u, 48u, 49u, - 50u, 52u, 53u, 54u, 56u, 57u, 58u, 60u, 61u, 62u}; - -void ARGBToRGB24Row_AVX512VBMI(const uint8_t* src, uint8_t* dst, int width) { - asm volatile( - "vmovdqa %3,%%ymm5 \n" - "vmovdqa %4,%%ymm6 \n" - "vmovdqa %5,%%ymm7 \n" - - LABELALIGN - "1: \n" - "vmovdqu (%0),%%ymm0 \n" - "vmovdqu 0x20(%0),%%ymm1 \n" - "vmovdqu 0x40(%0),%%ymm2 \n" - "vmovdqu 0x60(%0),%%ymm3 \n" - "lea 0x80(%0),%0 \n" - "vpermt2b %%ymm1,%%ymm5,%%ymm0 \n" - "vpermt2b %%ymm2,%%ymm6,%%ymm1 \n" - "vpermt2b %%ymm3,%%ymm7,%%ymm2 \n" - "vmovdqu %%ymm0,(%1) \n" - "vmovdqu %%ymm1,0x20(%1) \n" - "vmovdqu %%ymm2,0x40(%1) \n" - "lea 0x60(%1),%1 \n" - "sub 
$0x20,%2 \n" - "jg 1b \n" - "vzeroupper \n" - : "+r"(src), // %0 - "+r"(dst), // %1 - "+r"(width) // %2 - : "m"(kPermARGBToRGB24_0), // %3 - "m"(kPermARGBToRGB24_1), // %4 - "m"(kPermARGBToRGB24_2) // %5 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5", "xmm6", "xmm7"); -} -#endif - -#ifdef HAS_ARGBTORAWROW_AVX2 -void ARGBToRAWRow_AVX2(const uint8_t* src, uint8_t* dst, int width) { - asm volatile( - "vbroadcastf128 %3,%%ymm6 \n" - "vmovdqa %4,%%ymm7 \n" - - LABELALIGN - "1: \n" - "vmovdqu (%0),%%ymm0 \n" - "vmovdqu 0x20(%0),%%ymm1 \n" - "vmovdqu 0x40(%0),%%ymm2 \n" - "vmovdqu 0x60(%0),%%ymm3 \n" - "lea 0x80(%0),%0 \n" - "vpshufb %%ymm6,%%ymm0,%%ymm0 \n" // xxx0yyy0 - "vpshufb %%ymm6,%%ymm1,%%ymm1 \n" - "vpshufb %%ymm6,%%ymm2,%%ymm2 \n" - "vpshufb %%ymm6,%%ymm3,%%ymm3 \n" - "vpermd %%ymm0,%%ymm7,%%ymm0 \n" // pack to 24 bytes - "vpermd %%ymm1,%%ymm7,%%ymm1 \n" - "vpermd %%ymm2,%%ymm7,%%ymm2 \n" - "vpermd %%ymm3,%%ymm7,%%ymm3 \n" - "vpermq $0x3f,%%ymm1,%%ymm4 \n" // combine 24 + 8 - "vpor %%ymm4,%%ymm0,%%ymm0 \n" - "vmovdqu %%ymm0,(%1) \n" - "vpermq $0xf9,%%ymm1,%%ymm1 \n" // combine 16 + 16 - "vpermq $0x4f,%%ymm2,%%ymm4 \n" - "vpor %%ymm4,%%ymm1,%%ymm1 \n" - "vmovdqu %%ymm1,0x20(%1) \n" - "vpermq $0xfe,%%ymm2,%%ymm2 \n" // combine 8 + 24 - "vpermq $0x93,%%ymm3,%%ymm3 \n" - "vpor %%ymm3,%%ymm2,%%ymm2 \n" - "vmovdqu %%ymm2,0x40(%1) \n" - "lea 0x60(%1),%1 \n" - "sub $0x20,%2 \n" - "jg 1b \n" - "vzeroupper \n" - : "+r"(src), // %0 - "+r"(dst), // %1 - "+r"(width) // %2 - : "m"(kShuffleMaskARGBToRAW), // %3 - "m"(kPermdRGB24_AVX) // %4 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", - "xmm7"); -} -#endif - -void ARGBToRGB565Row_SSE2(const uint8_t* src, uint8_t* dst, int width) { - asm volatile( - "pcmpeqb %%xmm3,%%xmm3 \n" - "psrld $0x1b,%%xmm3 \n" - "pcmpeqb %%xmm4,%%xmm4 \n" - "psrld $0x1a,%%xmm4 \n" - "pslld $0x5,%%xmm4 \n" - "pcmpeqb %%xmm5,%%xmm5 \n" - "pslld $0xb,%%xmm5 \n" - - LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqa %%xmm0,%%xmm1 \n" - "movdqa %%xmm0,%%xmm2 \n" - "pslld $0x8,%%xmm0 \n" - "psrld $0x3,%%xmm1 \n" - "psrld $0x5,%%xmm2 \n" - "psrad $0x10,%%xmm0 \n" - "pand %%xmm3,%%xmm1 \n" - "pand %%xmm4,%%xmm2 \n" - "pand %%xmm5,%%xmm0 \n" - "por %%xmm2,%%xmm1 \n" - "por %%xmm1,%%xmm0 \n" - "packssdw %%xmm0,%%xmm0 \n" - "lea 0x10(%0),%0 \n" - "movq %%xmm0,(%1) \n" - "lea 0x8(%1),%1 \n" - "sub $0x4,%2 \n" - "jg 1b \n" - : "+r"(src), // %0 - "+r"(dst), // %1 - "+r"(width) // %2 - ::"memory", - "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); -} - -void ARGBToRGB565DitherRow_SSE2(const uint8_t* src, - uint8_t* dst, - const uint32_t dither4, - int width) { - asm volatile( - "movd %3,%%xmm6 \n" - "punpcklbw %%xmm6,%%xmm6 \n" - "movdqa %%xmm6,%%xmm7 \n" - "punpcklwd %%xmm6,%%xmm6 \n" - "punpckhwd %%xmm7,%%xmm7 \n" - "pcmpeqb %%xmm3,%%xmm3 \n" - "psrld $0x1b,%%xmm3 \n" - "pcmpeqb %%xmm4,%%xmm4 \n" - "psrld $0x1a,%%xmm4 \n" - "pslld $0x5,%%xmm4 \n" - "pcmpeqb %%xmm5,%%xmm5 \n" - "pslld $0xb,%%xmm5 \n" - - LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "paddusb %%xmm6,%%xmm0 \n" - "movdqa %%xmm0,%%xmm1 \n" - "movdqa %%xmm0,%%xmm2 \n" - "pslld $0x8,%%xmm0 \n" - "psrld $0x3,%%xmm1 \n" - "psrld $0x5,%%xmm2 \n" - "psrad $0x10,%%xmm0 \n" - "pand %%xmm3,%%xmm1 \n" - "pand %%xmm4,%%xmm2 \n" - "pand %%xmm5,%%xmm0 \n" - "por %%xmm2,%%xmm1 \n" - "por %%xmm1,%%xmm0 \n" - "packssdw %%xmm0,%%xmm0 \n" - "lea 0x10(%0),%0 \n" - "movq %%xmm0,(%1) \n" - "lea 0x8(%1),%1 \n" - "sub $0x4,%2 \n" - "jg 1b \n" - : "+r"(src), // %0 - "+r"(dst), // %1 - 
"+r"(width) // %2 - : "m"(dither4) // %3 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", - "xmm7"); -} - -#ifdef HAS_ARGBTORGB565DITHERROW_AVX2 -void ARGBToRGB565DitherRow_AVX2(const uint8_t* src, - uint8_t* dst, - const uint32_t dither4, - int width) { - asm volatile( - "vbroadcastss %3,%%xmm6 \n" - "vpunpcklbw %%xmm6,%%xmm6,%%xmm6 \n" - "vpermq $0xd8,%%ymm6,%%ymm6 \n" - "vpunpcklwd %%ymm6,%%ymm6,%%ymm6 \n" - "vpcmpeqb %%ymm3,%%ymm3,%%ymm3 \n" - "vpsrld $0x1b,%%ymm3,%%ymm3 \n" - "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n" - "vpsrld $0x1a,%%ymm4,%%ymm4 \n" - "vpslld $0x5,%%ymm4,%%ymm4 \n" - "vpslld $0xb,%%ymm3,%%ymm5 \n" - - LABELALIGN - "1: \n" - "vmovdqu (%0),%%ymm0 \n" - "vpaddusb %%ymm6,%%ymm0,%%ymm0 \n" - "vpsrld $0x5,%%ymm0,%%ymm2 \n" - "vpsrld $0x3,%%ymm0,%%ymm1 \n" - "vpsrld $0x8,%%ymm0,%%ymm0 \n" - "vpand %%ymm4,%%ymm2,%%ymm2 \n" - "vpand %%ymm3,%%ymm1,%%ymm1 \n" - "vpand %%ymm5,%%ymm0,%%ymm0 \n" - "vpor %%ymm2,%%ymm1,%%ymm1 \n" - "vpor %%ymm1,%%ymm0,%%ymm0 \n" - "vpackusdw %%ymm0,%%ymm0,%%ymm0 \n" - "vpermq $0xd8,%%ymm0,%%ymm0 \n" - "lea 0x20(%0),%0 \n" - "vmovdqu %%xmm0,(%1) \n" - "lea 0x10(%1),%1 \n" - "sub $0x8,%2 \n" - "jg 1b \n" - "vzeroupper \n" - : "+r"(src), // %0 - "+r"(dst), // %1 - "+r"(width) // %2 - : "m"(dither4) // %3 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", - "xmm7"); -} -#endif // HAS_ARGBTORGB565DITHERROW_AVX2 - -void ARGBToARGB1555Row_SSE2(const uint8_t* src, uint8_t* dst, int width) { - asm volatile( - "pcmpeqb %%xmm4,%%xmm4 \n" - "psrld $0x1b,%%xmm4 \n" - "movdqa %%xmm4,%%xmm5 \n" - "pslld $0x5,%%xmm5 \n" - "movdqa %%xmm4,%%xmm6 \n" - "pslld $0xa,%%xmm6 \n" - "pcmpeqb %%xmm7,%%xmm7 \n" - "pslld $0xf,%%xmm7 \n" - - LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqa %%xmm0,%%xmm1 \n" - "movdqa %%xmm0,%%xmm2 \n" - "movdqa %%xmm0,%%xmm3 \n" - "psrad $0x10,%%xmm0 \n" - "psrld $0x3,%%xmm1 \n" - "psrld $0x6,%%xmm2 \n" - "psrld $0x9,%%xmm3 \n" - "pand %%xmm7,%%xmm0 \n" - "pand %%xmm4,%%xmm1 \n" - "pand %%xmm5,%%xmm2 \n" - "pand %%xmm6,%%xmm3 \n" - "por %%xmm1,%%xmm0 \n" - "por %%xmm3,%%xmm2 \n" - "por %%xmm2,%%xmm0 \n" - "packssdw %%xmm0,%%xmm0 \n" - "lea 0x10(%0),%0 \n" - "movq %%xmm0,(%1) \n" - "lea 0x8(%1),%1 \n" - "sub $0x4,%2 \n" - "jg 1b \n" - : "+r"(src), // %0 - "+r"(dst), // %1 - "+r"(width) // %2 - ::"memory", - "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"); -} - -void ARGBToARGB4444Row_SSE2(const uint8_t* src, uint8_t* dst, int width) { - asm volatile( - "pcmpeqb %%xmm4,%%xmm4 \n" - "psllw $0xc,%%xmm4 \n" - "movdqa %%xmm4,%%xmm3 \n" - "psrlw $0x8,%%xmm3 \n" - - LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqa %%xmm0,%%xmm1 \n" - "pand %%xmm3,%%xmm0 \n" - "pand %%xmm4,%%xmm1 \n" - "psrlq $0x4,%%xmm0 \n" - "psrlq $0x8,%%xmm1 \n" - "por %%xmm1,%%xmm0 \n" - "packuswb %%xmm0,%%xmm0 \n" - "lea 0x10(%0),%0 \n" - "movq %%xmm0,(%1) \n" - "lea 0x8(%1),%1 \n" - "sub $0x4,%2 \n" - "jg 1b \n" - : "+r"(src), // %0 - "+r"(dst), // %1 - "+r"(width) // %2 - ::"memory", - "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"); -} -#endif // HAS_RGB24TOARGBROW_SSSE3 - -/* - -ARGBToAR30Row: - -Red Blue -With the 8 bit value in the upper bits of a short, vpmulhuw by (1024+4) will -produce a 10 bit value in the low 10 bits of each 16 bit value. This is whats -wanted for the blue channel. The red needs to be shifted 4 left, so multiply by -(1024+4)*16 for red. 
- -Alpha Green -Alpha and Green are already in the high bits so vpand can zero out the other -bits, keeping just 2 upper bits of alpha and 8 bit green. The same multiplier -could be used for Green - (1024+4) putting the 10 bit green in the lsb. Alpha -would be a simple multiplier to shift it into position. It wants a gap of 10 -above the green. Green is 10 bits, so there are 6 bits in the low short. 4 -more are needed, so a multiplier of 4 gets the 2 bits into the upper 16 bits, -and then a shift of 4 is a multiply of 16, so (4*16) = 64. Then shift the -result left 10 to position the A and G channels. -*/ - -// Shuffle table for converting RAW to RGB24. Last 8. -static const uvec8 kShuffleRB30 = {128u, 0u, 128u, 2u, 128u, 4u, 128u, 6u, - 128u, 8u, 128u, 10u, 128u, 12u, 128u, 14u}; - -static const uvec8 kShuffleBR30 = {128u, 2u, 128u, 0u, 128u, 6u, 128u, 4u, - 128u, 10u, 128u, 8u, 128u, 14u, 128u, 12u}; - -static const uint32_t kMulRB10 = 1028 * 16 * 65536 + 1028; -static const uint32_t kMaskRB10 = 0x3ff003ff; -static const uint32_t kMaskAG10 = 0xc000ff00; -static const uint32_t kMulAG10 = 64 * 65536 + 1028; - -void ARGBToAR30Row_SSSE3(const uint8_t* src, uint8_t* dst, int width) { - asm volatile( - "movdqa %3,%%xmm2 \n" // shuffler for RB - "movd %4,%%xmm3 \n" // multipler for RB - "movd %5,%%xmm4 \n" // mask for R10 B10 - "movd %6,%%xmm5 \n" // mask for AG - "movd %7,%%xmm6 \n" // multipler for AG - "pshufd $0x0,%%xmm3,%%xmm3 \n" - "pshufd $0x0,%%xmm4,%%xmm4 \n" - "pshufd $0x0,%%xmm5,%%xmm5 \n" - "pshufd $0x0,%%xmm6,%%xmm6 \n" - "sub %0,%1 \n" - - "1: \n" - "movdqu (%0),%%xmm0 \n" // fetch 4 ARGB pixels - "movdqa %%xmm0,%%xmm1 \n" - "pshufb %%xmm2,%%xmm1 \n" // R0B0 - "pand %%xmm5,%%xmm0 \n" // A0G0 - "pmulhuw %%xmm3,%%xmm1 \n" // X2 R16 X4 B10 - "pmulhuw %%xmm6,%%xmm0 \n" // X10 A2 X10 G10 - "pand %%xmm4,%%xmm1 \n" // X2 R10 X10 B10 - "pslld $10,%%xmm0 \n" // A2 x10 G10 x10 - "por %%xmm1,%%xmm0 \n" // A2 R10 G10 B10 - "movdqu %%xmm0,(%1,%0) \n" // store 4 AR30 pixels - "add $0x10,%0 \n" - "sub $0x4,%2 \n" - "jg 1b \n" - - : "+r"(src), // %0 - "+r"(dst), // %1 - "+r"(width) // %2 - : "m"(kShuffleRB30), // %3 - "m"(kMulRB10), // %4 - "m"(kMaskRB10), // %5 - "m"(kMaskAG10), // %6 - "m"(kMulAG10) // %7 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"); -} - -void ABGRToAR30Row_SSSE3(const uint8_t* src, uint8_t* dst, int width) { - asm volatile( - "movdqa %3,%%xmm2 \n" // shuffler for RB - "movd %4,%%xmm3 \n" // multipler for RB - "movd %5,%%xmm4 \n" // mask for R10 B10 - "movd %6,%%xmm5 \n" // mask for AG - "movd %7,%%xmm6 \n" // multipler for AG - "pshufd $0x0,%%xmm3,%%xmm3 \n" - "pshufd $0x0,%%xmm4,%%xmm4 \n" - "pshufd $0x0,%%xmm5,%%xmm5 \n" - "pshufd $0x0,%%xmm6,%%xmm6 \n" - "sub %0,%1 \n" - - "1: \n" - "movdqu (%0),%%xmm0 \n" // fetch 4 ABGR pixels - "movdqa %%xmm0,%%xmm1 \n" - "pshufb %%xmm2,%%xmm1 \n" // R0B0 - "pand %%xmm5,%%xmm0 \n" // A0G0 - "pmulhuw %%xmm3,%%xmm1 \n" // X2 R16 X4 B10 - "pmulhuw %%xmm6,%%xmm0 \n" // X10 A2 X10 G10 - "pand %%xmm4,%%xmm1 \n" // X2 R10 X10 B10 - "pslld $10,%%xmm0 \n" // A2 x10 G10 x10 - "por %%xmm1,%%xmm0 \n" // A2 R10 G10 B10 - "movdqu %%xmm0,(%1,%0) \n" // store 4 AR30 pixels - "add $0x10,%0 \n" - "sub $0x4,%2 \n" - "jg 1b \n" - - : "+r"(src), // %0 - "+r"(dst), // %1 - "+r"(width) // %2 - : "m"(kShuffleBR30), // %3 reversed shuffler - "m"(kMulRB10), // %4 - "m"(kMaskRB10), // %5 - "m"(kMaskAG10), // %6 - "m"(kMulAG10) // %7 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"); -} - -#ifdef 
HAS_ARGBTOAR30ROW_AVX2 -void ARGBToAR30Row_AVX2(const uint8_t* src, uint8_t* dst, int width) { - asm volatile( - "vbroadcastf128 %3,%%ymm2 \n" // shuffler for RB - "vbroadcastss %4,%%ymm3 \n" // multipler for RB - "vbroadcastss %5,%%ymm4 \n" // mask for R10 B10 - "vbroadcastss %6,%%ymm5 \n" // mask for AG - "vbroadcastss %7,%%ymm6 \n" // multipler for AG - "sub %0,%1 \n" - - "1: \n" - "vmovdqu (%0),%%ymm0 \n" // fetch 8 ARGB pixels - "vpshufb %%ymm2,%%ymm0,%%ymm1 \n" // R0B0 - "vpand %%ymm5,%%ymm0,%%ymm0 \n" // A0G0 - "vpmulhuw %%ymm3,%%ymm1,%%ymm1 \n" // X2 R16 X4 B10 - "vpmulhuw %%ymm6,%%ymm0,%%ymm0 \n" // X10 A2 X10 G10 - "vpand %%ymm4,%%ymm1,%%ymm1 \n" // X2 R10 X10 B10 - "vpslld $10,%%ymm0,%%ymm0 \n" // A2 x10 G10 x10 - "vpor %%ymm1,%%ymm0,%%ymm0 \n" // A2 R10 G10 B10 - "vmovdqu %%ymm0,(%1,%0) \n" // store 8 AR30 pixels - "add $0x20,%0 \n" - "sub $0x8,%2 \n" - "jg 1b \n" - "vzeroupper \n" - - : "+r"(src), // %0 - "+r"(dst), // %1 - "+r"(width) // %2 - : "m"(kShuffleRB30), // %3 - "m"(kMulRB10), // %4 - "m"(kMaskRB10), // %5 - "m"(kMaskAG10), // %6 - "m"(kMulAG10) // %7 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"); -} -#endif - -#ifdef HAS_ABGRTOAR30ROW_AVX2 -void ABGRToAR30Row_AVX2(const uint8_t* src, uint8_t* dst, int width) { - asm volatile( - "vbroadcastf128 %3,%%ymm2 \n" // shuffler for RB - "vbroadcastss %4,%%ymm3 \n" // multipler for RB - "vbroadcastss %5,%%ymm4 \n" // mask for R10 B10 - "vbroadcastss %6,%%ymm5 \n" // mask for AG - "vbroadcastss %7,%%ymm6 \n" // multipler for AG - "sub %0,%1 \n" - - "1: \n" - "vmovdqu (%0),%%ymm0 \n" // fetch 8 ABGR pixels - "vpshufb %%ymm2,%%ymm0,%%ymm1 \n" // R0B0 - "vpand %%ymm5,%%ymm0,%%ymm0 \n" // A0G0 - "vpmulhuw %%ymm3,%%ymm1,%%ymm1 \n" // X2 R16 X4 B10 - "vpmulhuw %%ymm6,%%ymm0,%%ymm0 \n" // X10 A2 X10 G10 - "vpand %%ymm4,%%ymm1,%%ymm1 \n" // X2 R10 X10 B10 - "vpslld $10,%%ymm0,%%ymm0 \n" // A2 x10 G10 x10 - "vpor %%ymm1,%%ymm0,%%ymm0 \n" // A2 R10 G10 B10 - "vmovdqu %%ymm0,(%1,%0) \n" // store 8 AR30 pixels - "add $0x20,%0 \n" - "sub $0x8,%2 \n" - "jg 1b \n" - "vzeroupper \n" - - : "+r"(src), // %0 - "+r"(dst), // %1 - "+r"(width) // %2 - : "m"(kShuffleBR30), // %3 reversed shuffler - "m"(kMulRB10), // %4 - "m"(kMaskRB10), // %5 - "m"(kMaskAG10), // %6 - "m"(kMulAG10) // %7 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"); -} -#endif - -static const uvec8 kShuffleARGBToABGR = {2, 1, 0, 3, 6, 5, 4, 7, - 10, 9, 8, 11, 14, 13, 12, 15}; - -static const uvec8 kShuffleARGBToAB64Lo = {2, 2, 1, 1, 0, 0, 3, 3, - 6, 6, 5, 5, 4, 4, 7, 7}; -static const uvec8 kShuffleARGBToAB64Hi = {10, 10, 9, 9, 8, 8, 11, 11, - 14, 14, 13, 13, 12, 12, 15, 15}; - -void ARGBToAR64Row_SSSE3(const uint8_t* src_argb, - uint16_t* dst_ar64, - int width) { - asm volatile( - - LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqa %%xmm0,%%xmm1 \n" - "punpcklbw %%xmm0,%%xmm0 \n" - "punpckhbw %%xmm1,%%xmm1 \n" - "movdqu %%xmm0,(%1) \n" - "movdqu %%xmm1,0x10(%1) \n" - "lea 0x10(%0),%0 \n" - "lea 0x20(%1),%1 \n" - "sub $0x4,%2 \n" - "jg 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_ar64), // %1 - "+r"(width) // %2 - : - : "memory", "cc", "xmm0", "xmm1"); -} - -void ARGBToAB64Row_SSSE3(const uint8_t* src_argb, - uint16_t* dst_ab64, - int width) { - asm volatile( - - "movdqa %3,%%xmm2 \n" - "movdqa %4,%%xmm3 \n" LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqa %%xmm0,%%xmm1 \n" - "pshufb %%xmm2,%%xmm0 \n" - "pshufb %%xmm3,%%xmm1 \n" - "movdqu %%xmm0,(%1) \n" - "movdqu %%xmm1,0x10(%1) \n" - "lea 0x10(%0),%0 \n" - 
"lea 0x20(%1),%1 \n" - "sub $0x4,%2 \n" - "jg 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_ab64), // %1 - "+r"(width) // %2 - : "m"(kShuffleARGBToAB64Lo), // %3 - "m"(kShuffleARGBToAB64Hi) // %4 - : "memory", "cc", "xmm0", "xmm1", "xmm2"); -} - -void AR64ToARGBRow_SSSE3(const uint16_t* src_ar64, - uint8_t* dst_argb, - int width) { - asm volatile( - - LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "psrlw $8,%%xmm0 \n" - "psrlw $8,%%xmm1 \n" - "packuswb %%xmm1,%%xmm0 \n" - "movdqu %%xmm0,(%1) \n" - "lea 0x20(%0),%0 \n" - "lea 0x10(%1),%1 \n" - "sub $0x4,%2 \n" - "jg 1b \n" - : "+r"(src_ar64), // %0 - "+r"(dst_argb), // %1 - "+r"(width) // %2 - : - : "memory", "cc", "xmm0", "xmm1"); -} - -void AB64ToARGBRow_SSSE3(const uint16_t* src_ab64, - uint8_t* dst_argb, - int width) { - asm volatile( - - "movdqa %3,%%xmm2 \n" LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "psrlw $8,%%xmm0 \n" - "psrlw $8,%%xmm1 \n" - "packuswb %%xmm1,%%xmm0 \n" - "pshufb %%xmm2,%%xmm0 \n" - "movdqu %%xmm0,(%1) \n" - "lea 0x20(%0),%0 \n" - "lea 0x10(%1),%1 \n" - "sub $0x4,%2 \n" - "jg 1b \n" - : "+r"(src_ab64), // %0 - "+r"(dst_argb), // %1 - "+r"(width) // %2 - : "m"(kShuffleARGBToABGR) // %3 - : "memory", "cc", "xmm0", "xmm1", "xmm2"); -} - -#ifdef HAS_ARGBTOAR64ROW_AVX2 -void ARGBToAR64Row_AVX2(const uint8_t* src_argb, - uint16_t* dst_ar64, - int width) { - asm volatile( - - LABELALIGN - "1: \n" - "vmovdqu (%0),%%ymm0 \n" - "vpermq $0xd8,%%ymm0,%%ymm0 \n" - "vpunpckhbw %%ymm0,%%ymm0,%%ymm1 \n" - "vpunpcklbw %%ymm0,%%ymm0,%%ymm0 \n" - "vmovdqu %%ymm0,(%1) \n" - "vmovdqu %%ymm1,0x20(%1) \n" - "lea 0x20(%0),%0 \n" - "lea 0x40(%1),%1 \n" - "sub $0x8,%2 \n" - "jg 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_ar64), // %1 - "+r"(width) // %2 - : - : "memory", "cc", "xmm0", "xmm1"); -} -#endif - -#ifdef HAS_ARGBTOAB64ROW_AVX2 -void ARGBToAB64Row_AVX2(const uint8_t* src_argb, - uint16_t* dst_ab64, - int width) { - asm volatile( - - "vbroadcastf128 %3,%%ymm2 \n" - "vbroadcastf128 %4,%%ymm3 \n" LABELALIGN - "1: \n" - "vmovdqu (%0),%%ymm0 \n" - "vpermq $0xd8,%%ymm0,%%ymm0 \n" - "vpshufb %%ymm3,%%ymm0,%%ymm1 \n" - "vpshufb %%ymm2,%%ymm0,%%ymm0 \n" - "vmovdqu %%ymm0,(%1) \n" - "vmovdqu %%ymm1,0x20(%1) \n" - "lea 0x20(%0),%0 \n" - "lea 0x40(%1),%1 \n" - "sub $0x8,%2 \n" - "jg 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_ab64), // %1 - "+r"(width) // %2 - : "m"(kShuffleARGBToAB64Lo), // %3 - "m"(kShuffleARGBToAB64Hi) // %3 - : "memory", "cc", "xmm0", "xmm1", "xmm2"); -} -#endif - -#ifdef HAS_AR64TOARGBROW_AVX2 -void AR64ToARGBRow_AVX2(const uint16_t* src_ar64, - uint8_t* dst_argb, - int width) { - asm volatile( - - LABELALIGN - "1: \n" - "vmovdqu (%0),%%ymm0 \n" - "vmovdqu 0x20(%0),%%ymm1 \n" - "vpsrlw $8,%%ymm0,%%ymm0 \n" - "vpsrlw $8,%%ymm1,%%ymm1 \n" - "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" - "vpermq $0xd8,%%ymm0,%%ymm0 \n" - "vmovdqu %%ymm0,(%1) \n" - "lea 0x40(%0),%0 \n" - "lea 0x20(%1),%1 \n" - "sub $0x8,%2 \n" - "jg 1b \n" - : "+r"(src_ar64), // %0 - "+r"(dst_argb), // %1 - "+r"(width) // %2 - : - : "memory", "cc", "xmm0", "xmm1"); -} -#endif - -#ifdef HAS_AB64TOARGBROW_AVX2 -void AB64ToARGBRow_AVX2(const uint16_t* src_ab64, - uint8_t* dst_argb, - int width) { - asm volatile( - - "vbroadcastf128 %3,%%ymm2 \n" LABELALIGN - "1: \n" - "vmovdqu (%0),%%ymm0 \n" - "vmovdqu 0x20(%0),%%ymm1 \n" - "vpsrlw $8,%%ymm0,%%ymm0 \n" - "vpsrlw $8,%%ymm1,%%ymm1 \n" - "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" - "vpermq $0xd8,%%ymm0,%%ymm0 \n" - "vpshufb %%ymm2,%%ymm0,%%ymm0 \n" - "vmovdqu 
%%ymm0,(%1) \n" - "lea 0x40(%0),%0 \n" - "lea 0x20(%1),%1 \n" - "sub $0x8,%2 \n" - "jg 1b \n" - : "+r"(src_ab64), // %0 - "+r"(dst_argb), // %1 - "+r"(width) // %2 - : "m"(kShuffleARGBToABGR) // %3 - : "memory", "cc", "xmm0", "xmm1", "xmm2"); -} -#endif - -// clang-format off - -// TODO(mraptis): Consider passing R, G, B multipliers as parameter. -// round parameter is register containing value to add before shift. -#define RGBTOY(round) \ - "1: \n" \ - "movdqu (%0),%%xmm0 \n" \ - "movdqu 0x10(%0),%%xmm1 \n" \ - "movdqu 0x20(%0),%%xmm2 \n" \ - "movdqu 0x30(%0),%%xmm3 \n" \ - "psubb %%xmm5,%%xmm0 \n" \ - "psubb %%xmm5,%%xmm1 \n" \ - "psubb %%xmm5,%%xmm2 \n" \ - "psubb %%xmm5,%%xmm3 \n" \ - "movdqu %%xmm4,%%xmm6 \n" \ - "pmaddubsw %%xmm0,%%xmm6 \n" \ - "movdqu %%xmm4,%%xmm0 \n" \ - "pmaddubsw %%xmm1,%%xmm0 \n" \ - "movdqu %%xmm4,%%xmm1 \n" \ - "pmaddubsw %%xmm2,%%xmm1 \n" \ - "movdqu %%xmm4,%%xmm2 \n" \ - "pmaddubsw %%xmm3,%%xmm2 \n" \ - "lea 0x40(%0),%0 \n" \ - "phaddw %%xmm0,%%xmm6 \n" \ - "phaddw %%xmm2,%%xmm1 \n" \ - "prefetcht0 1280(%0) \n" \ - "paddw %%" #round ",%%xmm6 \n" \ - "paddw %%" #round ",%%xmm1 \n" \ - "psrlw $0x8,%%xmm6 \n" \ - "psrlw $0x8,%%xmm1 \n" \ - "packuswb %%xmm1,%%xmm6 \n" \ - "movdqu %%xmm6,(%1) \n" \ - "lea 0x10(%1),%1 \n" \ - "sub $0x10,%2 \n" \ - "jg 1b \n" - -#define RGBTOY_AVX2(round) \ - "1: \n" \ - "vmovdqu (%0),%%ymm0 \n" \ - "vmovdqu 0x20(%0),%%ymm1 \n" \ - "vmovdqu 0x40(%0),%%ymm2 \n" \ - "vmovdqu 0x60(%0),%%ymm3 \n" \ - "vpsubb %%ymm5, %%ymm0, %%ymm0 \n" \ - "vpsubb %%ymm5, %%ymm1, %%ymm1 \n" \ - "vpsubb %%ymm5, %%ymm2, %%ymm2 \n" \ - "vpsubb %%ymm5, %%ymm3, %%ymm3 \n" \ - "vpmaddubsw %%ymm0,%%ymm4,%%ymm0 \n" \ - "vpmaddubsw %%ymm1,%%ymm4,%%ymm1 \n" \ - "vpmaddubsw %%ymm2,%%ymm4,%%ymm2 \n" \ - "vpmaddubsw %%ymm3,%%ymm4,%%ymm3 \n" \ - "lea 0x80(%0),%0 \n" \ - "vphaddw %%ymm1,%%ymm0,%%ymm0 \n" /* mutates. */ \ - "vphaddw %%ymm3,%%ymm2,%%ymm2 \n" \ - "prefetcht0 1280(%0) \n" \ - "vpaddw %%" #round ",%%ymm0,%%ymm0 \n" /* Add .5 for rounding. */ \ - "vpaddw %%" #round ",%%ymm2,%%ymm2 \n" \ - "vpsrlw $0x8,%%ymm0,%%ymm0 \n" \ - "vpsrlw $0x8,%%ymm2,%%ymm2 \n" \ - "vpackuswb %%ymm2,%%ymm0,%%ymm0 \n" /* mutates. */ \ - "vpermd %%ymm0,%%ymm6,%%ymm0 \n" /* unmutate. */ \ - "vmovdqu %%ymm0,(%1) \n" \ - "lea 0x20(%1),%1 \n" \ - "sub $0x20,%2 \n" \ - "jg 1b \n" \ - "vzeroupper \n" - -// clang-format on - -#ifdef HAS_ARGBTOYROW_SSSE3 -// Convert 16 ARGB pixels (64 bytes) to 16 Y values. -void ARGBToYRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_y, int width) { - asm volatile( - "movdqa %3,%%xmm4 \n" - "movdqa %4,%%xmm5 \n" - "movdqa %5,%%xmm7 \n" - - LABELALIGN RGBTOY(xmm7) - : "+r"(src_argb), // %0 - "+r"(dst_y), // %1 - "+r"(width) // %2 - : "m"(kARGBToY), // %3 - "m"(kSub128), // %4 - "m"(kAddY16) // %5 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", - "xmm7"); -} -#endif // HAS_ARGBTOYROW_SSSE3 - -#ifdef HAS_ARGBTOYJROW_SSSE3 -// Convert 16 ARGB pixels (64 bytes) to 16 YJ values. -// Same as ARGBToYRow but different coefficients, no add 16. -void ARGBToYJRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_y, int width) { - asm volatile( - "movdqa %3,%%xmm4 \n" - "movdqa %4,%%xmm5 \n" - - LABELALIGN RGBTOY(xmm5) - : "+r"(src_argb), // %0 - "+r"(dst_y), // %1 - "+r"(width) // %2 - : "m"(kARGBToYJ), // %3 - "m"(kSub128) // %4 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"); -} -#endif // HAS_ARGBTOYJROW_SSSE3 - -#ifdef HAS_RGBATOYJROW_SSSE3 -// Convert 16 ARGB pixels (64 bytes) to 16 YJ values. 
-// Same as ARGBToYRow but different coefficients, no add 16. -void RGBAToYJRow_SSSE3(const uint8_t* src_rgba, uint8_t* dst_y, int width) { - asm volatile( - "movdqa %3,%%xmm4 \n" - "movdqa %4,%%xmm5 \n" - - LABELALIGN RGBTOY(xmm5) - : "+r"(src_rgba), // %0 - "+r"(dst_y), // %1 - "+r"(width) // %2 - : "m"(kRGBAToYJ), // %3 - "m"(kSub128) // %4 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"); -} -#endif // HAS_RGBATOYJROW_SSSE3 - -#ifdef HAS_ARGBTOYROW_AVX2 -// vpermd for vphaddw + vpackuswb vpermd. -static const lvec32 kPermdARGBToY_AVX = {0, 4, 1, 5, 2, 6, 3, 7}; - -// Convert 32 ARGB pixels (128 bytes) to 32 Y values. -void ARGBToYRow_AVX2(const uint8_t* src_argb, uint8_t* dst_y, int width) { - asm volatile( - "vbroadcastf128 %3,%%ymm4 \n" - "vbroadcastf128 %4,%%ymm5 \n" - "vbroadcastf128 %5,%%ymm7 \n" - "vmovdqu %6,%%ymm6 \n" - - LABELALIGN RGBTOY_AVX2(ymm7) - : "+r"(src_argb), // %0 - "+r"(dst_y), // %1 - "+r"(width) // %2 - : "m"(kARGBToY), // %3 - "m"(kSub128), // %4 - "m"(kAddY16), // %5 - "m"(kPermdARGBToY_AVX) // %6 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", - "xmm7"); -} -#endif // HAS_ARGBTOYROW_AVX2 - -#ifdef HAS_ABGRTOYROW_AVX2 -// Convert 32 ABGR pixels (128 bytes) to 32 Y values. -void ABGRToYRow_AVX2(const uint8_t* src_abgr, uint8_t* dst_y, int width) { - asm volatile( - "vbroadcastf128 %3,%%ymm4 \n" - "vbroadcastf128 %4,%%ymm5 \n" - "vbroadcastf128 %5,%%ymm7 \n" - "vmovdqu %6,%%ymm6 \n" - - LABELALIGN RGBTOY_AVX2(ymm7) - : "+r"(src_abgr), // %0 - "+r"(dst_y), // %1 - "+r"(width) // %2 - : "m"(kABGRToY), // %3 - "m"(kSub128), // %4 - "m"(kAddY16), // %5 - "m"(kPermdARGBToY_AVX) // %6 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", - "xmm7"); -} -#endif // HAS_ABGRTOYROW_AVX2 - -#ifdef HAS_ARGBTOYJROW_AVX2 -// Convert 32 ARGB pixels (128 bytes) to 32 Y values. -void ARGBToYJRow_AVX2(const uint8_t* src_argb, uint8_t* dst_y, int width) { - asm volatile( - "vbroadcastf128 %3,%%ymm4 \n" - "vbroadcastf128 %4,%%ymm5 \n" - "vmovdqu %5,%%ymm6 \n" - - LABELALIGN RGBTOY_AVX2(ymm5) - : "+r"(src_argb), // %0 - "+r"(dst_y), // %1 - "+r"(width) // %2 - : "m"(kARGBToYJ), // %3 - "m"(kSub128), // %4 - "m"(kPermdARGBToY_AVX) // %5 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", - "xmm7"); -} -#endif // HAS_ARGBTOYJROW_AVX2 - -#ifdef HAS_RGBATOYJROW_AVX2 -// Convert 32 ARGB pixels (128 bytes) to 32 Y values. 
-void RGBAToYJRow_AVX2(const uint8_t* src_rgba, uint8_t* dst_y, int width) { - asm volatile( - "vbroadcastf128 %3,%%ymm4 \n" - "vbroadcastf128 %4,%%ymm5 \n" - "vmovdqu %5,%%ymm6 \n" - - LABELALIGN RGBTOY_AVX2( - ymm5) "vzeroupper \n" - : "+r"(src_rgba), // %0 - "+r"(dst_y), // %1 - "+r"(width) // %2 - : "m"(kRGBAToYJ), // %3 - "m"(kSub128), // %4 - "m"(kPermdARGBToY_AVX) // %5 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"); -} -#endif // HAS_RGBATOYJROW_AVX2 - -#ifdef HAS_ARGBTOUVROW_SSSE3 -void ARGBToUVRow_SSSE3(const uint8_t* src_argb, - int src_stride_argb, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - asm volatile( - "movdqa %5,%%xmm3 \n" - "movdqa %6,%%xmm4 \n" - "movdqa %7,%%xmm5 \n" - "sub %1,%2 \n" - - LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x00(%0,%4,1),%%xmm7 \n" - "pavgb %%xmm7,%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "movdqu 0x10(%0,%4,1),%%xmm7 \n" - "pavgb %%xmm7,%%xmm1 \n" - "movdqu 0x20(%0),%%xmm2 \n" - "movdqu 0x20(%0,%4,1),%%xmm7 \n" - "pavgb %%xmm7,%%xmm2 \n" - "movdqu 0x30(%0),%%xmm6 \n" - "movdqu 0x30(%0,%4,1),%%xmm7 \n" - "pavgb %%xmm7,%%xmm6 \n" - - "lea 0x40(%0),%0 \n" - "movdqa %%xmm0,%%xmm7 \n" - "shufps $0x88,%%xmm1,%%xmm0 \n" - "shufps $0xdd,%%xmm1,%%xmm7 \n" - "pavgb %%xmm7,%%xmm0 \n" - "movdqa %%xmm2,%%xmm7 \n" - "shufps $0x88,%%xmm6,%%xmm2 \n" - "shufps $0xdd,%%xmm6,%%xmm7 \n" - "pavgb %%xmm7,%%xmm2 \n" - "movdqa %%xmm0,%%xmm1 \n" - "movdqa %%xmm2,%%xmm6 \n" - "pmaddubsw %%xmm4,%%xmm0 \n" - "pmaddubsw %%xmm4,%%xmm2 \n" - "pmaddubsw %%xmm3,%%xmm1 \n" - "pmaddubsw %%xmm3,%%xmm6 \n" - "phaddw %%xmm2,%%xmm0 \n" - "phaddw %%xmm6,%%xmm1 \n" - "psraw $0x8,%%xmm0 \n" - "psraw $0x8,%%xmm1 \n" - "packsswb %%xmm1,%%xmm0 \n" - "paddb %%xmm5,%%xmm0 \n" - "movlps %%xmm0,(%1) \n" - "movhps %%xmm0,0x00(%1,%2,1) \n" - "lea 0x8(%1),%1 \n" - "sub $0x10,%3 \n" - "jg 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_u), // %1 - "+r"(dst_v), // %2 - "+rm"(width) // %3 - : "r"((intptr_t)(src_stride_argb)), // %4 - "m"(kARGBToV), // %5 - "m"(kARGBToU), // %6 - "m"(kAddUV128) // %7 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"); -} -#endif // HAS_ARGBTOUVROW_SSSE3 - -#ifdef HAS_ARGBTOUVROW_AVX2 -// vpshufb for vphaddw + vpackuswb packed to shorts. 
-static const lvec8 kShufARGBToUV_AVX = { - 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15, - 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15}; -void ARGBToUVRow_AVX2(const uint8_t* src_argb, - int src_stride_argb, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - asm volatile( - "vbroadcastf128 %5,%%ymm5 \n" - "vbroadcastf128 %6,%%ymm6 \n" - "vbroadcastf128 %7,%%ymm7 \n" - "sub %1,%2 \n" - - LABELALIGN - "1: \n" - "vmovdqu (%0),%%ymm0 \n" - "vmovdqu 0x20(%0),%%ymm1 \n" - "vmovdqu 0x40(%0),%%ymm2 \n" - "vmovdqu 0x60(%0),%%ymm3 \n" - "vpavgb 0x00(%0,%4,1),%%ymm0,%%ymm0 \n" - "vpavgb 0x20(%0,%4,1),%%ymm1,%%ymm1 \n" - "vpavgb 0x40(%0,%4,1),%%ymm2,%%ymm2 \n" - "vpavgb 0x60(%0,%4,1),%%ymm3,%%ymm3 \n" - "lea 0x80(%0),%0 \n" - "vshufps $0x88,%%ymm1,%%ymm0,%%ymm4 \n" - "vshufps $0xdd,%%ymm1,%%ymm0,%%ymm0 \n" - "vpavgb %%ymm4,%%ymm0,%%ymm0 \n" - "vshufps $0x88,%%ymm3,%%ymm2,%%ymm4 \n" - "vshufps $0xdd,%%ymm3,%%ymm2,%%ymm2 \n" - "vpavgb %%ymm4,%%ymm2,%%ymm2 \n" - - "vpmaddubsw %%ymm7,%%ymm0,%%ymm1 \n" - "vpmaddubsw %%ymm7,%%ymm2,%%ymm3 \n" - "vpmaddubsw %%ymm6,%%ymm0,%%ymm0 \n" - "vpmaddubsw %%ymm6,%%ymm2,%%ymm2 \n" - "vphaddw %%ymm3,%%ymm1,%%ymm1 \n" - "vphaddw %%ymm2,%%ymm0,%%ymm0 \n" - "vpsraw $0x8,%%ymm1,%%ymm1 \n" - "vpsraw $0x8,%%ymm0,%%ymm0 \n" - "vpacksswb %%ymm0,%%ymm1,%%ymm0 \n" - "vpermq $0xd8,%%ymm0,%%ymm0 \n" - "vpshufb %8,%%ymm0,%%ymm0 \n" - "vpaddb %%ymm5,%%ymm0,%%ymm0 \n" - - "vextractf128 $0x0,%%ymm0,(%1) \n" - "vextractf128 $0x1,%%ymm0,0x0(%1,%2,1) \n" - "lea 0x10(%1),%1 \n" - "sub $0x20,%3 \n" - "jg 1b \n" - "vzeroupper \n" - : "+r"(src_argb), // %0 - "+r"(dst_u), // %1 - "+r"(dst_v), // %2 - "+rm"(width) // %3 - : "r"((intptr_t)(src_stride_argb)), // %4 - "m"(kAddUV128), // %5 - "m"(kARGBToV), // %6 - "m"(kARGBToU), // %7 - "m"(kShufARGBToUV_AVX) // %8 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", - "xmm7"); -} -#endif // HAS_ARGBTOUVROW_AVX2 - -#ifdef HAS_ABGRTOUVROW_AVX2 -void ABGRToUVRow_AVX2(const uint8_t* src_abgr, - int src_stride_abgr, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - asm volatile( - "vbroadcastf128 %5,%%ymm5 \n" - "vbroadcastf128 %6,%%ymm6 \n" - "vbroadcastf128 %7,%%ymm7 \n" - "sub %1,%2 \n" - - LABELALIGN - "1: \n" - "vmovdqu (%0),%%ymm0 \n" - "vmovdqu 0x20(%0),%%ymm1 \n" - "vmovdqu 0x40(%0),%%ymm2 \n" - "vmovdqu 0x60(%0),%%ymm3 \n" - "vpavgb 0x00(%0,%4,1),%%ymm0,%%ymm0 \n" - "vpavgb 0x20(%0,%4,1),%%ymm1,%%ymm1 \n" - "vpavgb 0x40(%0,%4,1),%%ymm2,%%ymm2 \n" - "vpavgb 0x60(%0,%4,1),%%ymm3,%%ymm3 \n" - "lea 0x80(%0),%0 \n" - "vshufps $0x88,%%ymm1,%%ymm0,%%ymm4 \n" - "vshufps $0xdd,%%ymm1,%%ymm0,%%ymm0 \n" - "vpavgb %%ymm4,%%ymm0,%%ymm0 \n" - "vshufps $0x88,%%ymm3,%%ymm2,%%ymm4 \n" - "vshufps $0xdd,%%ymm3,%%ymm2,%%ymm2 \n" - "vpavgb %%ymm4,%%ymm2,%%ymm2 \n" - - "vpmaddubsw %%ymm7,%%ymm0,%%ymm1 \n" - "vpmaddubsw %%ymm7,%%ymm2,%%ymm3 \n" - "vpmaddubsw %%ymm6,%%ymm0,%%ymm0 \n" - "vpmaddubsw %%ymm6,%%ymm2,%%ymm2 \n" - "vphaddw %%ymm3,%%ymm1,%%ymm1 \n" - "vphaddw %%ymm2,%%ymm0,%%ymm0 \n" - "vpsraw $0x8,%%ymm1,%%ymm1 \n" - "vpsraw $0x8,%%ymm0,%%ymm0 \n" - "vpacksswb %%ymm0,%%ymm1,%%ymm0 \n" - "vpermq $0xd8,%%ymm0,%%ymm0 \n" - "vpshufb %8,%%ymm0,%%ymm0 \n" - "vpaddb %%ymm5,%%ymm0,%%ymm0 \n" - - "vextractf128 $0x0,%%ymm0,(%1) \n" - "vextractf128 $0x1,%%ymm0,0x0(%1,%2,1) \n" - "lea 0x10(%1),%1 \n" - "sub $0x20,%3 \n" - "jg 1b \n" - "vzeroupper \n" - : "+r"(src_abgr), // %0 - "+r"(dst_u), // %1 - "+r"(dst_v), // %2 - "+rm"(width) // %3 - : "r"((intptr_t)(src_stride_abgr)), // %4 - "m"(kAddUV128), // %5 - "m"(kABGRToV), // 
%6 - "m"(kABGRToU), // %7 - "m"(kShufARGBToUV_AVX) // %8 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", - "xmm7"); -} -#endif // HAS_ABGRTOUVROW_AVX2 - -#ifdef HAS_ARGBTOUVJROW_AVX2 -void ARGBToUVJRow_AVX2(const uint8_t* src_argb, - int src_stride_argb, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - asm volatile( - "vbroadcastf128 %5,%%ymm5 \n" - "vbroadcastf128 %6,%%ymm6 \n" - "vbroadcastf128 %7,%%ymm7 \n" - "sub %1,%2 \n" - - LABELALIGN - "1: \n" - "vmovdqu (%0),%%ymm0 \n" - "vmovdqu 0x20(%0),%%ymm1 \n" - "vmovdqu 0x40(%0),%%ymm2 \n" - "vmovdqu 0x60(%0),%%ymm3 \n" - "vpavgb 0x00(%0,%4,1),%%ymm0,%%ymm0 \n" - "vpavgb 0x20(%0,%4,1),%%ymm1,%%ymm1 \n" - "vpavgb 0x40(%0,%4,1),%%ymm2,%%ymm2 \n" - "vpavgb 0x60(%0,%4,1),%%ymm3,%%ymm3 \n" - "lea 0x80(%0),%0 \n" - "vshufps $0x88,%%ymm1,%%ymm0,%%ymm4 \n" - "vshufps $0xdd,%%ymm1,%%ymm0,%%ymm0 \n" - "vpavgb %%ymm4,%%ymm0,%%ymm0 \n" - "vshufps $0x88,%%ymm3,%%ymm2,%%ymm4 \n" - "vshufps $0xdd,%%ymm3,%%ymm2,%%ymm2 \n" - "vpavgb %%ymm4,%%ymm2,%%ymm2 \n" - - "vpmaddubsw %%ymm7,%%ymm0,%%ymm1 \n" - "vpmaddubsw %%ymm7,%%ymm2,%%ymm3 \n" - "vpmaddubsw %%ymm6,%%ymm0,%%ymm0 \n" - "vpmaddubsw %%ymm6,%%ymm2,%%ymm2 \n" - "vphaddw %%ymm3,%%ymm1,%%ymm1 \n" - "vphaddw %%ymm2,%%ymm0,%%ymm0 \n" - "vpaddw %%ymm5,%%ymm0,%%ymm0 \n" - "vpaddw %%ymm5,%%ymm1,%%ymm1 \n" - "vpsraw $0x8,%%ymm1,%%ymm1 \n" - "vpsraw $0x8,%%ymm0,%%ymm0 \n" - "vpacksswb %%ymm0,%%ymm1,%%ymm0 \n" - "vpermq $0xd8,%%ymm0,%%ymm0 \n" - "vpshufb %8,%%ymm0,%%ymm0 \n" - - "vextractf128 $0x0,%%ymm0,(%1) \n" - "vextractf128 $0x1,%%ymm0,0x0(%1,%2,1) \n" - "lea 0x10(%1),%1 \n" - "sub $0x20,%3 \n" - "jg 1b \n" - "vzeroupper \n" - : "+r"(src_argb), // %0 - "+r"(dst_u), // %1 - "+r"(dst_v), // %2 - "+rm"(width) // %3 - : "r"((intptr_t)(src_stride_argb)), // %4 - "m"(kSub128), // %5 - "m"(kARGBToVJ), // %6 - "m"(kARGBToUJ), // %7 - "m"(kShufARGBToUV_AVX) // %8 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", - "xmm7"); -} -#endif // HAS_ARGBTOUVJROW_AVX2 - -#ifdef HAS_ARGBTOUVJROW_SSSE3 -void ARGBToUVJRow_SSSE3(const uint8_t* src_argb, - int src_stride_argb, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - asm volatile( - "movdqa %5,%%xmm3 \n" - "movdqa %6,%%xmm4 \n" - "movdqa %7,%%xmm5 \n" - "sub %1,%2 \n" - - LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x00(%0,%4,1),%%xmm7 \n" - "pavgb %%xmm7,%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "movdqu 0x10(%0,%4,1),%%xmm7 \n" - "pavgb %%xmm7,%%xmm1 \n" - "movdqu 0x20(%0),%%xmm2 \n" - "movdqu 0x20(%0,%4,1),%%xmm7 \n" - "pavgb %%xmm7,%%xmm2 \n" - "movdqu 0x30(%0),%%xmm6 \n" - "movdqu 0x30(%0,%4,1),%%xmm7 \n" - "pavgb %%xmm7,%%xmm6 \n" - - "lea 0x40(%0),%0 \n" - "movdqa %%xmm0,%%xmm7 \n" - "shufps $0x88,%%xmm1,%%xmm0 \n" - "shufps $0xdd,%%xmm1,%%xmm7 \n" - "pavgb %%xmm7,%%xmm0 \n" - "movdqa %%xmm2,%%xmm7 \n" - "shufps $0x88,%%xmm6,%%xmm2 \n" - "shufps $0xdd,%%xmm6,%%xmm7 \n" - "pavgb %%xmm7,%%xmm2 \n" - "movdqa %%xmm0,%%xmm1 \n" - "movdqa %%xmm2,%%xmm6 \n" - "pmaddubsw %%xmm4,%%xmm0 \n" - "pmaddubsw %%xmm4,%%xmm2 \n" - "pmaddubsw %%xmm3,%%xmm1 \n" - "pmaddubsw %%xmm3,%%xmm6 \n" - "phaddw %%xmm2,%%xmm0 \n" - "phaddw %%xmm6,%%xmm1 \n" - "paddw %%xmm5,%%xmm0 \n" - "paddw %%xmm5,%%xmm1 \n" - "psraw $0x8,%%xmm0 \n" - "psraw $0x8,%%xmm1 \n" - "packsswb %%xmm1,%%xmm0 \n" - "movlps %%xmm0,(%1) \n" - "movhps %%xmm0,0x00(%1,%2,1) \n" - "lea 0x8(%1),%1 \n" - "sub $0x10,%3 \n" - "jg 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_u), // %1 - "+r"(dst_v), // %2 - "+rm"(width) // %3 - : 
"r"((intptr_t)(src_stride_argb)), // %4 - "m"(kARGBToVJ), // %5 - "m"(kARGBToUJ), // %6 - "m"(kSub128) // %7 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"); -} -#endif // HAS_ARGBTOUVJROW_SSSE3 - -#ifdef HAS_ARGBTOUV444ROW_SSSE3 -void ARGBToUV444Row_SSSE3(const uint8_t* src_argb, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - asm volatile( - "movdqa %4,%%xmm3 \n" - "movdqa %5,%%xmm4 \n" - "movdqa %6,%%xmm5 \n" - "sub %1,%2 \n" - - LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "movdqu 0x20(%0),%%xmm2 \n" - "movdqu 0x30(%0),%%xmm6 \n" - "pmaddubsw %%xmm4,%%xmm0 \n" - "pmaddubsw %%xmm4,%%xmm1 \n" - "pmaddubsw %%xmm4,%%xmm2 \n" - "pmaddubsw %%xmm4,%%xmm6 \n" - "phaddw %%xmm1,%%xmm0 \n" - "phaddw %%xmm6,%%xmm2 \n" - "psraw $0x8,%%xmm0 \n" - "psraw $0x8,%%xmm2 \n" - "packsswb %%xmm2,%%xmm0 \n" - "paddb %%xmm5,%%xmm0 \n" - "movdqu %%xmm0,(%1) \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "movdqu 0x20(%0),%%xmm2 \n" - "movdqu 0x30(%0),%%xmm6 \n" - "pmaddubsw %%xmm3,%%xmm0 \n" - "pmaddubsw %%xmm3,%%xmm1 \n" - "pmaddubsw %%xmm3,%%xmm2 \n" - "pmaddubsw %%xmm3,%%xmm6 \n" - "phaddw %%xmm1,%%xmm0 \n" - "phaddw %%xmm6,%%xmm2 \n" - "psraw $0x8,%%xmm0 \n" - "psraw $0x8,%%xmm2 \n" - "packsswb %%xmm2,%%xmm0 \n" - "paddb %%xmm5,%%xmm0 \n" - "lea 0x40(%0),%0 \n" - "movdqu %%xmm0,0x00(%1,%2,1) \n" - "lea 0x10(%1),%1 \n" - "sub $0x10,%3 \n" - "jg 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_u), // %1 - "+r"(dst_v), // %2 - "+rm"(width) // %3 - : "m"(kARGBToV), // %4 - "m"(kARGBToU), // %5 - "m"(kAddUV128) // %6 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm6"); -} -#endif // HAS_ARGBTOUV444ROW_SSSE3 - -void BGRAToYRow_SSSE3(const uint8_t* src_bgra, uint8_t* dst_y, int width) { - asm volatile( - "movdqa %3,%%xmm4 \n" - "movdqa %4,%%xmm5 \n" - "movdqa %5,%%xmm7 \n" - - LABELALIGN RGBTOY(xmm7) - : "+r"(src_bgra), // %0 - "+r"(dst_y), // %1 - "+r"(width) // %2 - : "m"(kBGRAToY), // %3 - "m"(kSub128), // %4 - "m"(kAddY16) // %5 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", - "xmm7"); -} - -void BGRAToUVRow_SSSE3(const uint8_t* src_bgra, - int src_stride_bgra, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - asm volatile( - "movdqa %5,%%xmm3 \n" - "movdqa %6,%%xmm4 \n" - "movdqa %7,%%xmm5 \n" - "sub %1,%2 \n" - - LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x00(%0,%4,1),%%xmm7 \n" - "pavgb %%xmm7,%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "movdqu 0x10(%0,%4,1),%%xmm7 \n" - "pavgb %%xmm7,%%xmm1 \n" - "movdqu 0x20(%0),%%xmm2 \n" - "movdqu 0x20(%0,%4,1),%%xmm7 \n" - "pavgb %%xmm7,%%xmm2 \n" - "movdqu 0x30(%0),%%xmm6 \n" - "movdqu 0x30(%0,%4,1),%%xmm7 \n" - "pavgb %%xmm7,%%xmm6 \n" - - "lea 0x40(%0),%0 \n" - "movdqa %%xmm0,%%xmm7 \n" - "shufps $0x88,%%xmm1,%%xmm0 \n" - "shufps $0xdd,%%xmm1,%%xmm7 \n" - "pavgb %%xmm7,%%xmm0 \n" - "movdqa %%xmm2,%%xmm7 \n" - "shufps $0x88,%%xmm6,%%xmm2 \n" - "shufps $0xdd,%%xmm6,%%xmm7 \n" - "pavgb %%xmm7,%%xmm2 \n" - "movdqa %%xmm0,%%xmm1 \n" - "movdqa %%xmm2,%%xmm6 \n" - "pmaddubsw %%xmm4,%%xmm0 \n" - "pmaddubsw %%xmm4,%%xmm2 \n" - "pmaddubsw %%xmm3,%%xmm1 \n" - "pmaddubsw %%xmm3,%%xmm6 \n" - "phaddw %%xmm2,%%xmm0 \n" - "phaddw %%xmm6,%%xmm1 \n" - "psraw $0x8,%%xmm0 \n" - "psraw $0x8,%%xmm1 \n" - "packsswb %%xmm1,%%xmm0 \n" - "paddb %%xmm5,%%xmm0 \n" - "movlps %%xmm0,(%1) \n" - "movhps %%xmm0,0x00(%1,%2,1) \n" - "lea 0x8(%1),%1 \n" - "sub $0x10,%3 \n" - "jg 1b \n" - : "+r"(src_bgra), // %0 - "+r"(dst_u), // %1 - "+r"(dst_v), // %2 - "+rm"(width) // %3 - : 
"r"((intptr_t)(src_stride_bgra)), // %4 - "m"(kBGRAToV), // %5 - "m"(kBGRAToU), // %6 - "m"(kAddUV128) // %7 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"); -} - -void ABGRToYRow_SSSE3(const uint8_t* src_abgr, uint8_t* dst_y, int width) { - asm volatile( - "movdqa %3,%%xmm4 \n" - "movdqa %4,%%xmm5 \n" - "movdqa %5,%%xmm7 \n" - - LABELALIGN RGBTOY(xmm7) - : "+r"(src_abgr), // %0 - "+r"(dst_y), // %1 - "+r"(width) // %2 - : "m"(kABGRToY), // %3 - "m"(kSub128), // %4 - "m"(kAddY16) // %5 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", - "xmm7"); -} - -void RGBAToYRow_SSSE3(const uint8_t* src_rgba, uint8_t* dst_y, int width) { - asm volatile( - "movdqa %3,%%xmm4 \n" - "movdqa %4,%%xmm5 \n" - "movdqa %5,%%xmm7 \n" - - LABELALIGN RGBTOY(xmm7) - : "+r"(src_rgba), // %0 - "+r"(dst_y), // %1 - "+r"(width) // %2 - : "m"(kRGBAToY), // %3 - "m"(kSub128), // %4 - "m"(kAddY16) // %5 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", - "xmm7"); -} - -void ABGRToUVRow_SSSE3(const uint8_t* src_abgr, - int src_stride_abgr, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - asm volatile( - "movdqa %5,%%xmm3 \n" - "movdqa %6,%%xmm4 \n" - "movdqa %7,%%xmm5 \n" - "sub %1,%2 \n" - - LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x00(%0,%4,1),%%xmm7 \n" - "pavgb %%xmm7,%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "movdqu 0x10(%0,%4,1),%%xmm7 \n" - "pavgb %%xmm7,%%xmm1 \n" - "movdqu 0x20(%0),%%xmm2 \n" - "movdqu 0x20(%0,%4,1),%%xmm7 \n" - "pavgb %%xmm7,%%xmm2 \n" - "movdqu 0x30(%0),%%xmm6 \n" - "movdqu 0x30(%0,%4,1),%%xmm7 \n" - "pavgb %%xmm7,%%xmm6 \n" - - "lea 0x40(%0),%0 \n" - "movdqa %%xmm0,%%xmm7 \n" - "shufps $0x88,%%xmm1,%%xmm0 \n" - "shufps $0xdd,%%xmm1,%%xmm7 \n" - "pavgb %%xmm7,%%xmm0 \n" - "movdqa %%xmm2,%%xmm7 \n" - "shufps $0x88,%%xmm6,%%xmm2 \n" - "shufps $0xdd,%%xmm6,%%xmm7 \n" - "pavgb %%xmm7,%%xmm2 \n" - "movdqa %%xmm0,%%xmm1 \n" - "movdqa %%xmm2,%%xmm6 \n" - "pmaddubsw %%xmm4,%%xmm0 \n" - "pmaddubsw %%xmm4,%%xmm2 \n" - "pmaddubsw %%xmm3,%%xmm1 \n" - "pmaddubsw %%xmm3,%%xmm6 \n" - "phaddw %%xmm2,%%xmm0 \n" - "phaddw %%xmm6,%%xmm1 \n" - "psraw $0x8,%%xmm0 \n" - "psraw $0x8,%%xmm1 \n" - "packsswb %%xmm1,%%xmm0 \n" - "paddb %%xmm5,%%xmm0 \n" - "movlps %%xmm0,(%1) \n" - "movhps %%xmm0,0x00(%1,%2,1) \n" - "lea 0x8(%1),%1 \n" - "sub $0x10,%3 \n" - "jg 1b \n" - : "+r"(src_abgr), // %0 - "+r"(dst_u), // %1 - "+r"(dst_v), // %2 - "+rm"(width) // %3 - : "r"((intptr_t)(src_stride_abgr)), // %4 - "m"(kABGRToV), // %5 - "m"(kABGRToU), // %6 - "m"(kAddUV128) // %7 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"); -} - -void RGBAToUVRow_SSSE3(const uint8_t* src_rgba, - int src_stride_rgba, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - asm volatile( - "movdqa %5,%%xmm3 \n" - "movdqa %6,%%xmm4 \n" - "movdqa %7,%%xmm5 \n" - "sub %1,%2 \n" - - LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x00(%0,%4,1),%%xmm7 \n" - "pavgb %%xmm7,%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "movdqu 0x10(%0,%4,1),%%xmm7 \n" - "pavgb %%xmm7,%%xmm1 \n" - "movdqu 0x20(%0),%%xmm2 \n" - "movdqu 0x20(%0,%4,1),%%xmm7 \n" - "pavgb %%xmm7,%%xmm2 \n" - "movdqu 0x30(%0),%%xmm6 \n" - "movdqu 0x30(%0,%4,1),%%xmm7 \n" - "pavgb %%xmm7,%%xmm6 \n" - - "lea 0x40(%0),%0 \n" - "movdqa %%xmm0,%%xmm7 \n" - "shufps $0x88,%%xmm1,%%xmm0 \n" - "shufps $0xdd,%%xmm1,%%xmm7 \n" - "pavgb %%xmm7,%%xmm0 \n" - "movdqa %%xmm2,%%xmm7 \n" - "shufps $0x88,%%xmm6,%%xmm2 \n" - "shufps $0xdd,%%xmm6,%%xmm7 \n" - "pavgb %%xmm7,%%xmm2 \n" - "movdqa %%xmm0,%%xmm1 \n" - 
"movdqa %%xmm2,%%xmm6 \n" - "pmaddubsw %%xmm4,%%xmm0 \n" - "pmaddubsw %%xmm4,%%xmm2 \n" - "pmaddubsw %%xmm3,%%xmm1 \n" - "pmaddubsw %%xmm3,%%xmm6 \n" - "phaddw %%xmm2,%%xmm0 \n" - "phaddw %%xmm6,%%xmm1 \n" - "psraw $0x8,%%xmm0 \n" - "psraw $0x8,%%xmm1 \n" - "packsswb %%xmm1,%%xmm0 \n" - "paddb %%xmm5,%%xmm0 \n" - "movlps %%xmm0,(%1) \n" - "movhps %%xmm0,0x00(%1,%2,1) \n" - "lea 0x8(%1),%1 \n" - "sub $0x10,%3 \n" - "jg 1b \n" - : "+r"(src_rgba), // %0 - "+r"(dst_u), // %1 - "+r"(dst_v), // %2 - "+rm"(width) // %3 - : "r"((intptr_t)(src_stride_rgba)), // %4 - "m"(kRGBAToV), // %5 - "m"(kRGBAToU), // %6 - "m"(kAddUV128) // %7 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"); -} - -#if defined(HAS_I422TOARGBROW_SSSE3) || defined(HAS_I422TOARGBROW_AVX2) - -// Read 8 UV from 444 -#define READYUV444 \ - "movq (%[u_buf]),%%xmm3 \n" \ - "movq 0x00(%[u_buf],%[v_buf],1),%%xmm1 \n" \ - "lea 0x8(%[u_buf]),%[u_buf] \n" \ - "punpcklbw %%xmm1,%%xmm3 \n" \ - "movq (%[y_buf]),%%xmm4 \n" \ - "punpcklbw %%xmm4,%%xmm4 \n" \ - "lea 0x8(%[y_buf]),%[y_buf] \n" - -// Read 4 UV from 422, upsample to 8 UV -#define READYUV422 \ - "movd (%[u_buf]),%%xmm3 \n" \ - "movd 0x00(%[u_buf],%[v_buf],1),%%xmm1 \n" \ - "lea 0x4(%[u_buf]),%[u_buf] \n" \ - "punpcklbw %%xmm1,%%xmm3 \n" \ - "punpcklwd %%xmm3,%%xmm3 \n" \ - "movq (%[y_buf]),%%xmm4 \n" \ - "punpcklbw %%xmm4,%%xmm4 \n" \ - "lea 0x8(%[y_buf]),%[y_buf] \n" - -// Read 4 UV from 422 10 bit, upsample to 8 UV -// TODO(fbarchard): Consider shufb to replace pack/unpack -// TODO(fbarchard): Consider pmulhuw to replace psraw -// TODO(fbarchard): Consider pmullw to replace psllw and allow different bits. -#define READYUV210 \ - "movq (%[u_buf]),%%xmm3 \n" \ - "movq 0x00(%[u_buf],%[v_buf],1),%%xmm1 \n" \ - "lea 0x8(%[u_buf]),%[u_buf] \n" \ - "punpcklwd %%xmm1,%%xmm3 \n" \ - "psraw $2,%%xmm3 \n" \ - "packuswb %%xmm3,%%xmm3 \n" \ - "punpcklwd %%xmm3,%%xmm3 \n" \ - "movdqu (%[y_buf]),%%xmm4 \n" \ - "psllw $6,%%xmm4 \n" \ - "lea 0x10(%[y_buf]),%[y_buf] \n" - -#define READYUVA210 \ - "movq (%[u_buf]),%%xmm3 \n" \ - "movq 0x00(%[u_buf],%[v_buf],1),%%xmm1 \n" \ - "lea 0x8(%[u_buf]),%[u_buf] \n" \ - "punpcklwd %%xmm1,%%xmm3 \n" \ - "psraw $2,%%xmm3 \n" \ - "packuswb %%xmm3,%%xmm3 \n" \ - "punpcklwd %%xmm3,%%xmm3 \n" \ - "movdqu (%[y_buf]),%%xmm4 \n" \ - "psllw $6,%%xmm4 \n" \ - "lea 0x10(%[y_buf]),%[y_buf] \n" \ - "movdqu (%[a_buf]),%%xmm5 \n" \ - "psraw $2,%%xmm5 \n" \ - "packuswb %%xmm5,%%xmm5 \n" \ - "lea 0x10(%[a_buf]),%[a_buf] \n" - -// Read 8 UV from 444 10 bit -#define READYUV410 \ - "movdqu (%[u_buf]),%%xmm3 \n" \ - "movdqu 0x00(%[u_buf],%[v_buf],1),%%xmm2 \n" \ - "lea 0x10(%[u_buf]),%[u_buf] \n" \ - "psraw $2,%%xmm3 \n" \ - "psraw $2,%%xmm2 \n" \ - "movdqa %%xmm3,%%xmm1 \n" \ - "punpcklwd %%xmm2,%%xmm3 \n" \ - "punpckhwd %%xmm2,%%xmm1 \n" \ - "packuswb %%xmm1,%%xmm3 \n" \ - "movdqu (%[y_buf]),%%xmm4 \n" \ - "psllw $6,%%xmm4 \n" \ - "lea 0x10(%[y_buf]),%[y_buf] \n" - -// Read 8 UV from 444 10 bit. With 8 Alpha. 
-#define READYUVA410 \ - "movdqu (%[u_buf]),%%xmm3 \n" \ - "movdqu 0x00(%[u_buf],%[v_buf],1),%%xmm2 \n" \ - "lea 0x10(%[u_buf]),%[u_buf] \n" \ - "psraw $2,%%xmm3 \n" \ - "psraw $2,%%xmm2 \n" \ - "movdqa %%xmm3,%%xmm1 \n" \ - "punpcklwd %%xmm2,%%xmm3 \n" \ - "punpckhwd %%xmm2,%%xmm1 \n" \ - "packuswb %%xmm1,%%xmm3 \n" \ - "movdqu (%[y_buf]),%%xmm4 \n" \ - "psllw $0x6,%%xmm4 \n" \ - "lea 0x10(%[y_buf]),%[y_buf] \n" \ - "movdqu (%[a_buf]),%%xmm5 \n" \ - "psraw $2,%%xmm5 \n" \ - "packuswb %%xmm5,%%xmm5 \n" \ - "lea 0x10(%[a_buf]),%[a_buf] \n" - -// Read 4 UV from 422 12 bit, upsample to 8 UV -#define READYUV212 \ - "movq (%[u_buf]),%%xmm3 \n" \ - "movq 0x00(%[u_buf],%[v_buf],1),%%xmm1 \n" \ - "lea 0x8(%[u_buf]),%[u_buf] \n" \ - "punpcklwd %%xmm1,%%xmm3 \n" \ - "psraw $0x4,%%xmm3 \n" \ - "packuswb %%xmm3,%%xmm3 \n" \ - "punpcklwd %%xmm3,%%xmm3 \n" \ - "movdqu (%[y_buf]),%%xmm4 \n" \ - "psllw $0x4,%%xmm4 \n" \ - "lea 0x10(%[y_buf]),%[y_buf] \n" - -// Read 4 UV from 422, upsample to 8 UV. With 8 Alpha. -#define READYUVA422 \ - "movd (%[u_buf]),%%xmm3 \n" \ - "movd 0x00(%[u_buf],%[v_buf],1),%%xmm1 \n" \ - "lea 0x4(%[u_buf]),%[u_buf] \n" \ - "punpcklbw %%xmm1,%%xmm3 \n" \ - "punpcklwd %%xmm3,%%xmm3 \n" \ - "movq (%[y_buf]),%%xmm4 \n" \ - "punpcklbw %%xmm4,%%xmm4 \n" \ - "lea 0x8(%[y_buf]),%[y_buf] \n" \ - "movq (%[a_buf]),%%xmm5 \n" \ - "lea 0x8(%[a_buf]),%[a_buf] \n" - -// Read 8 UV from 444. With 8 Alpha. -#define READYUVA444 \ - "movq (%[u_buf]),%%xmm3 \n" \ - "movq 0x00(%[u_buf],%[v_buf],1),%%xmm1 \n" \ - "lea 0x8(%[u_buf]),%[u_buf] \n" \ - "punpcklbw %%xmm1,%%xmm3 \n" \ - "movq (%[y_buf]),%%xmm4 \n" \ - "punpcklbw %%xmm4,%%xmm4 \n" \ - "lea 0x8(%[y_buf]),%[y_buf] \n" \ - "movq (%[a_buf]),%%xmm5 \n" \ - "lea 0x8(%[a_buf]),%[a_buf] \n" - -// Read 4 UV from NV12, upsample to 8 UV -#define READNV12 \ - "movq (%[uv_buf]),%%xmm3 \n" \ - "lea 0x8(%[uv_buf]),%[uv_buf] \n" \ - "punpcklwd %%xmm3,%%xmm3 \n" \ - "movq (%[y_buf]),%%xmm4 \n" \ - "punpcklbw %%xmm4,%%xmm4 \n" \ - "lea 0x8(%[y_buf]),%[y_buf] \n" - -// Read 4 VU from NV21, upsample to 8 UV -#define READNV21 \ - "movq (%[vu_buf]),%%xmm3 \n" \ - "lea 0x8(%[vu_buf]),%[vu_buf] \n" \ - "pshufb %[kShuffleNV21], %%xmm3 \n" \ - "movq (%[y_buf]),%%xmm4 \n" \ - "punpcklbw %%xmm4,%%xmm4 \n" \ - "lea 0x8(%[y_buf]),%[y_buf] \n" - -// Read 4 YUY2 with 8 Y and update 4 UV to 8 UV. -#define READYUY2 \ - "movdqu (%[yuy2_buf]),%%xmm4 \n" \ - "pshufb %[kShuffleYUY2Y], %%xmm4 \n" \ - "movdqu (%[yuy2_buf]),%%xmm3 \n" \ - "pshufb %[kShuffleYUY2UV], %%xmm3 \n" \ - "lea 0x10(%[yuy2_buf]),%[yuy2_buf] \n" - -// Read 4 UYVY with 8 Y and update 4 UV to 8 UV. 
-#define READUYVY \ - "movdqu (%[uyvy_buf]),%%xmm4 \n" \ - "pshufb %[kShuffleUYVYY], %%xmm4 \n" \ - "movdqu (%[uyvy_buf]),%%xmm3 \n" \ - "pshufb %[kShuffleUYVYUV], %%xmm3 \n" \ - "lea 0x10(%[uyvy_buf]),%[uyvy_buf] \n" - -// Read 4 UV from P210, upsample to 8 UV -#define READP210 \ - "movdqu (%[uv_buf]),%%xmm3 \n" \ - "lea 0x10(%[uv_buf]),%[uv_buf] \n" \ - "psrlw $0x8,%%xmm3 \n" \ - "packuswb %%xmm3,%%xmm3 \n" \ - "punpcklwd %%xmm3,%%xmm3 \n" \ - "movdqu (%[y_buf]),%%xmm4 \n" \ - "lea 0x10(%[y_buf]),%[y_buf] \n" - -// Read 8 UV from P410 -#define READP410 \ - "movdqu (%[uv_buf]),%%xmm3 \n" \ - "movdqu 0x10(%[uv_buf]),%%xmm1 \n" \ - "lea 0x20(%[uv_buf]),%[uv_buf] \n" \ - "psrlw $0x8,%%xmm3 \n" \ - "psrlw $0x8,%%xmm1 \n" \ - "packuswb %%xmm1,%%xmm3 \n" \ - "movdqu (%[y_buf]),%%xmm4 \n" \ - "lea 0x10(%[y_buf]),%[y_buf] \n" - -#if defined(__x86_64__) -#define YUVTORGB_SETUP(yuvconstants) \ - "pcmpeqb %%xmm13,%%xmm13 \n" \ - "movdqa (%[yuvconstants]),%%xmm8 \n" \ - "pxor %%xmm12,%%xmm12 \n" \ - "movdqa 32(%[yuvconstants]),%%xmm9 \n" \ - "psllw $7,%%xmm13 \n" \ - "movdqa 64(%[yuvconstants]),%%xmm10 \n" \ - "pshufb %%xmm12,%%xmm13 \n" \ - "movdqa 96(%[yuvconstants]),%%xmm11 \n" \ - "movdqa 128(%[yuvconstants]),%%xmm12 \n" - -// Convert 8 pixels: 8 UV and 8 Y -#define YUVTORGB16(yuvconstants) \ - "psubb %%xmm13,%%xmm3 \n" \ - "pmulhuw %%xmm11,%%xmm4 \n" \ - "movdqa %%xmm8,%%xmm0 \n" \ - "movdqa %%xmm9,%%xmm1 \n" \ - "movdqa %%xmm10,%%xmm2 \n" \ - "paddw %%xmm12,%%xmm4 \n" \ - "pmaddubsw %%xmm3,%%xmm0 \n" \ - "pmaddubsw %%xmm3,%%xmm1 \n" \ - "pmaddubsw %%xmm3,%%xmm2 \n" \ - "paddsw %%xmm4,%%xmm0 \n" \ - "paddsw %%xmm4,%%xmm2 \n" \ - "psubsw %%xmm1,%%xmm4 \n" \ - "movdqa %%xmm4,%%xmm1 \n" - -#define YUVTORGB_REGS "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", - -#else -#define YUVTORGB_SETUP(yuvconstants) -// Convert 8 pixels: 8 UV and 8 Y -#define YUVTORGB16(yuvconstants) \ - "pcmpeqb %%xmm0,%%xmm0 \n" \ - "pxor %%xmm1,%%xmm1 \n" \ - "psllw $7,%%xmm0 \n" \ - "pshufb %%xmm1,%%xmm0 \n" \ - "psubb %%xmm0,%%xmm3 \n" \ - "pmulhuw 96(%[yuvconstants]),%%xmm4 \n" \ - "movdqa (%[yuvconstants]),%%xmm0 \n" \ - "movdqa 32(%[yuvconstants]),%%xmm1 \n" \ - "movdqa 64(%[yuvconstants]),%%xmm2 \n" \ - "pmaddubsw %%xmm3,%%xmm0 \n" \ - "pmaddubsw %%xmm3,%%xmm1 \n" \ - "pmaddubsw %%xmm3,%%xmm2 \n" \ - "movdqa 128(%[yuvconstants]),%%xmm3 \n" \ - "paddw %%xmm3,%%xmm4 \n" \ - "paddsw %%xmm4,%%xmm0 \n" \ - "paddsw %%xmm4,%%xmm2 \n" \ - "psubsw %%xmm1,%%xmm4 \n" \ - "movdqa %%xmm4,%%xmm1 \n" - -#define YUVTORGB_REGS -#endif - -#define YUVTORGB(yuvconstants) \ - YUVTORGB16(yuvconstants) \ - "psraw $0x6,%%xmm0 \n" \ - "psraw $0x6,%%xmm1 \n" \ - "psraw $0x6,%%xmm2 \n" \ - "packuswb %%xmm0,%%xmm0 \n" \ - "packuswb %%xmm1,%%xmm1 \n" \ - "packuswb %%xmm2,%%xmm2 \n" - -// Store 8 ARGB values. -#define STOREARGB \ - "punpcklbw %%xmm1,%%xmm0 \n" \ - "punpcklbw %%xmm5,%%xmm2 \n" \ - "movdqa %%xmm0,%%xmm1 \n" \ - "punpcklwd %%xmm2,%%xmm0 \n" \ - "punpckhwd %%xmm2,%%xmm1 \n" \ - "movdqu %%xmm0,(%[dst_argb]) \n" \ - "movdqu %%xmm1,0x10(%[dst_argb]) \n" \ - "lea 0x20(%[dst_argb]), %[dst_argb] \n" - -// Store 8 RGBA values. -#define STORERGBA \ - "pcmpeqb %%xmm5,%%xmm5 \n" \ - "punpcklbw %%xmm2,%%xmm1 \n" \ - "punpcklbw %%xmm0,%%xmm5 \n" \ - "movdqa %%xmm5,%%xmm0 \n" \ - "punpcklwd %%xmm1,%%xmm5 \n" \ - "punpckhwd %%xmm1,%%xmm0 \n" \ - "movdqu %%xmm5,(%[dst_rgba]) \n" \ - "movdqu %%xmm0,0x10(%[dst_rgba]) \n" \ - "lea 0x20(%[dst_rgba]),%[dst_rgba] \n" - -// Store 8 AR30 values. 
-#define STOREAR30 \ - "psraw $0x4,%%xmm0 \n" \ - "psraw $0x4,%%xmm1 \n" \ - "psraw $0x4,%%xmm2 \n" \ - "pminsw %%xmm7,%%xmm0 \n" \ - "pminsw %%xmm7,%%xmm1 \n" \ - "pminsw %%xmm7,%%xmm2 \n" \ - "pmaxsw %%xmm6,%%xmm0 \n" \ - "pmaxsw %%xmm6,%%xmm1 \n" \ - "pmaxsw %%xmm6,%%xmm2 \n" \ - "psllw $0x4,%%xmm2 \n" \ - "movdqa %%xmm0,%%xmm3 \n" \ - "punpcklwd %%xmm2,%%xmm0 \n" \ - "punpckhwd %%xmm2,%%xmm3 \n" \ - "movdqa %%xmm1,%%xmm2 \n" \ - "punpcklwd %%xmm5,%%xmm1 \n" \ - "punpckhwd %%xmm5,%%xmm2 \n" \ - "pslld $0xa,%%xmm1 \n" \ - "pslld $0xa,%%xmm2 \n" \ - "por %%xmm1,%%xmm0 \n" \ - "por %%xmm2,%%xmm3 \n" \ - "movdqu %%xmm0,(%[dst_ar30]) \n" \ - "movdqu %%xmm3,0x10(%[dst_ar30]) \n" \ - "lea 0x20(%[dst_ar30]), %[dst_ar30] \n" - -void OMITFP I444ToARGBRow_SSSE3(const uint8_t* y_buf, - const uint8_t* u_buf, - const uint8_t* v_buf, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { - asm volatile ( - YUVTORGB_SETUP(yuvconstants) - "sub %[u_buf],%[v_buf] \n" - "pcmpeqb %%xmm5,%%xmm5 \n" - - LABELALIGN - "1: \n" - READYUV444 - YUVTORGB(yuvconstants) - STOREARGB - "sub $0x8,%[width] \n" - "jg 1b \n" - : [y_buf]"+r"(y_buf), // %[y_buf] - [u_buf]"+r"(u_buf), // %[u_buf] - [v_buf]"+r"(v_buf), // %[v_buf] - [dst_argb]"+r"(dst_argb), // %[dst_argb] - [width]"+rm"(width) // %[width] - : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] - : "memory", "cc", YUVTORGB_REGS - "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" - ); -} - -#ifdef HAS_I444ALPHATOARGBROW_SSSE3 -void OMITFP I444AlphaToARGBRow_SSSE3(const uint8_t* y_buf, - const uint8_t* u_buf, - const uint8_t* v_buf, - const uint8_t* a_buf, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { - // clang-format off - asm volatile ( - YUVTORGB_SETUP(yuvconstants) - "sub %[u_buf],%[v_buf] \n" - - LABELALIGN - "1: \n" - READYUVA444 - YUVTORGB(yuvconstants) - STOREARGB - "subl $0x8,%[width] \n" - "jg 1b \n" - : [y_buf]"+r"(y_buf), // %[y_buf] - [u_buf]"+r"(u_buf), // %[u_buf] - [v_buf]"+r"(v_buf), // %[v_buf] - [a_buf]"+r"(a_buf), // %[a_buf] - [dst_argb]"+r"(dst_argb), // %[dst_argb] -#if defined(__i386__) - [width]"+m"(width) // %[width] -#else - [width]"+rm"(width) // %[width] -#endif - : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] - : "memory", "cc", YUVTORGB_REGS - "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" - ); - // clang-format on -} -#endif // HAS_I444ALPHATOARGBROW_SSSE3 - -void OMITFP I422ToRGB24Row_SSSE3(const uint8_t* y_buf, - const uint8_t* u_buf, - const uint8_t* v_buf, - uint8_t* dst_rgb24, - const struct YuvConstants* yuvconstants, - int width) { - asm volatile ( - YUVTORGB_SETUP(yuvconstants) - "movdqa %[kShuffleMaskARGBToRGB24_0],%%xmm5 \n" - "movdqa %[kShuffleMaskARGBToRGB24],%%xmm6 \n" - "sub %[u_buf],%[v_buf] \n" - - LABELALIGN - "1: \n" - READYUV422 - YUVTORGB(yuvconstants) - "punpcklbw %%xmm1,%%xmm0 \n" - "punpcklbw %%xmm2,%%xmm2 \n" - "movdqa %%xmm0,%%xmm1 \n" - "punpcklwd %%xmm2,%%xmm0 \n" - "punpckhwd %%xmm2,%%xmm1 \n" - "pshufb %%xmm5,%%xmm0 \n" - "pshufb %%xmm6,%%xmm1 \n" - "palignr $0xc,%%xmm0,%%xmm1 \n" - "movq %%xmm0,(%[dst_rgb24]) \n" - "movdqu %%xmm1,0x8(%[dst_rgb24]) \n" - "lea 0x18(%[dst_rgb24]),%[dst_rgb24] \n" - "subl $0x8,%[width] \n" - "jg 1b \n" - : [y_buf]"+r"(y_buf), // %[y_buf] - [u_buf]"+r"(u_buf), // %[u_buf] - [v_buf]"+r"(v_buf), // %[v_buf] - [dst_rgb24]"+r"(dst_rgb24), // %[dst_rgb24] -#if defined(__i386__) - [width]"+m"(width) // %[width] -#else - [width]"+rm"(width) // %[width] -#endif - : [yuvconstants]"r"(yuvconstants), // %[yuvconstants] - 
[kShuffleMaskARGBToRGB24_0]"m"(kShuffleMaskARGBToRGB24_0), - [kShuffleMaskARGBToRGB24]"m"(kShuffleMaskARGBToRGB24) - : "memory", "cc", YUVTORGB_REGS - "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6" - ); -} - -void OMITFP I422ToARGBRow_SSSE3(const uint8_t* y_buf, - const uint8_t* u_buf, - const uint8_t* v_buf, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { - asm volatile ( - YUVTORGB_SETUP(yuvconstants) - "sub %[u_buf],%[v_buf] \n" - "pcmpeqb %%xmm5,%%xmm5 \n" - - LABELALIGN - "1: \n" - READYUV422 - YUVTORGB(yuvconstants) - STOREARGB - "sub $0x8,%[width] \n" - "jg 1b \n" - : [y_buf]"+r"(y_buf), // %[y_buf] - [u_buf]"+r"(u_buf), // %[u_buf] - [v_buf]"+r"(v_buf), // %[v_buf] - [dst_argb]"+r"(dst_argb), // %[dst_argb] - [width]"+rm"(width) // %[width] - : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] - : "memory", "cc", YUVTORGB_REGS - "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" - ); -} - -void OMITFP I422ToAR30Row_SSSE3(const uint8_t* y_buf, - const uint8_t* u_buf, - const uint8_t* v_buf, - uint8_t* dst_ar30, - const struct YuvConstants* yuvconstants, - int width) { - asm volatile ( - YUVTORGB_SETUP(yuvconstants) - "sub %[u_buf],%[v_buf] \n" - "pcmpeqb %%xmm5,%%xmm5 \n" // AR30 constants - "psrlw $14,%%xmm5 \n" - "psllw $4,%%xmm5 \n" // 2 alpha bits - "pxor %%xmm6,%%xmm6 \n" // 0 for min - "pcmpeqb %%xmm7,%%xmm7 \n" - "psrlw $6,%%xmm7 \n" // 1023 for max - - LABELALIGN - "1: \n" - READYUV422 - YUVTORGB16(yuvconstants) - STOREAR30 - "sub $0x8,%[width] \n" - "jg 1b \n" - : [y_buf]"+r"(y_buf), // %[y_buf] - [u_buf]"+r"(u_buf), // %[u_buf] - [v_buf]"+r"(v_buf), // %[v_buf] - [dst_ar30]"+r"(dst_ar30), // %[dst_ar30] - [width]"+rm"(width) // %[width] - : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] - : "memory", "cc", YUVTORGB_REGS - "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" - ); -} - -// 10 bit YUV to ARGB -void OMITFP I210ToARGBRow_SSSE3(const uint16_t* y_buf, - const uint16_t* u_buf, - const uint16_t* v_buf, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { - asm volatile ( - YUVTORGB_SETUP(yuvconstants) - "sub %[u_buf],%[v_buf] \n" - "pcmpeqb %%xmm5,%%xmm5 \n" - - LABELALIGN - "1: \n" - READYUV210 - YUVTORGB(yuvconstants) - STOREARGB - "sub $0x8,%[width] \n" - "jg 1b \n" - : [y_buf]"+r"(y_buf), // %[y_buf] - [u_buf]"+r"(u_buf), // %[u_buf] - [v_buf]"+r"(v_buf), // %[v_buf] - [dst_argb]"+r"(dst_argb), // %[dst_argb] - [width]"+rm"(width) // %[width] - : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] - : "memory", "cc", YUVTORGB_REGS - "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" - ); -} - -// 12 bit YUV to ARGB -void OMITFP I212ToARGBRow_SSSE3(const uint16_t* y_buf, - const uint16_t* u_buf, - const uint16_t* v_buf, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { - asm volatile ( - YUVTORGB_SETUP(yuvconstants) - "sub %[u_buf],%[v_buf] \n" - "pcmpeqb %%xmm5,%%xmm5 \n" - - LABELALIGN - "1: \n" - READYUV212 - YUVTORGB(yuvconstants) - STOREARGB - "sub $0x8,%[width] \n" - "jg 1b \n" - : [y_buf]"+r"(y_buf), // %[y_buf] - [u_buf]"+r"(u_buf), // %[u_buf] - [v_buf]"+r"(v_buf), // %[v_buf] - [dst_argb]"+r"(dst_argb), // %[dst_argb] - [width]"+rm"(width) // %[width] - : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] - : "memory", "cc", YUVTORGB_REGS - "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" - ); -} - -// 10 bit YUV to AR30 -void OMITFP I210ToAR30Row_SSSE3(const uint16_t* y_buf, - const uint16_t* u_buf, - const uint16_t* v_buf, - uint8_t* dst_ar30, - const 
struct YuvConstants* yuvconstants, - int width) { - asm volatile ( - YUVTORGB_SETUP(yuvconstants) - "sub %[u_buf],%[v_buf] \n" - "pcmpeqb %%xmm5,%%xmm5 \n" - "psrlw $14,%%xmm5 \n" - "psllw $4,%%xmm5 \n" // 2 alpha bits - "pxor %%xmm6,%%xmm6 \n" // 0 for min - "pcmpeqb %%xmm7,%%xmm7 \n" - "psrlw $6,%%xmm7 \n" // 1023 for max - - LABELALIGN - "1: \n" - READYUV210 - YUVTORGB16(yuvconstants) - STOREAR30 - "sub $0x8,%[width] \n" - "jg 1b \n" - : [y_buf]"+r"(y_buf), // %[y_buf] - [u_buf]"+r"(u_buf), // %[u_buf] - [v_buf]"+r"(v_buf), // %[v_buf] - [dst_ar30]"+r"(dst_ar30), // %[dst_ar30] - [width]"+rm"(width) // %[width] - : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] - : "memory", "cc", YUVTORGB_REGS - "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" - ); -} - -// 12 bit YUV to AR30 -void OMITFP I212ToAR30Row_SSSE3(const uint16_t* y_buf, - const uint16_t* u_buf, - const uint16_t* v_buf, - uint8_t* dst_ar30, - const struct YuvConstants* yuvconstants, - int width) { - asm volatile ( - YUVTORGB_SETUP(yuvconstants) - "sub %[u_buf],%[v_buf] \n" - "pcmpeqb %%xmm5,%%xmm5 \n" - "psrlw $14,%%xmm5 \n" - "psllw $4,%%xmm5 \n" // 2 alpha bits - "pxor %%xmm6,%%xmm6 \n" // 0 for min - "pcmpeqb %%xmm7,%%xmm7 \n" - "psrlw $6,%%xmm7 \n" // 1023 for max - - LABELALIGN - "1: \n" - READYUV212 - YUVTORGB16(yuvconstants) - STOREAR30 - "sub $0x8,%[width] \n" - "jg 1b \n" - : [y_buf]"+r"(y_buf), // %[y_buf] - [u_buf]"+r"(u_buf), // %[u_buf] - [v_buf]"+r"(v_buf), // %[v_buf] - [dst_ar30]"+r"(dst_ar30), // %[dst_ar30] - [width]"+rm"(width) // %[width] - : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] - : "memory", "cc", YUVTORGB_REGS - "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" - ); -} - -// 10 bit YUV to ARGB -void OMITFP I410ToARGBRow_SSSE3(const uint16_t* y_buf, - const uint16_t* u_buf, - const uint16_t* v_buf, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { - asm volatile ( - YUVTORGB_SETUP(yuvconstants) - "sub %[u_buf],%[v_buf] \n" - "pcmpeqb %%xmm5,%%xmm5 \n" - - LABELALIGN - "1: \n" - READYUV410 - YUVTORGB(yuvconstants) - STOREARGB - "sub $0x8,%[width] \n" - "jg 1b \n" - : [y_buf]"+r"(y_buf), // %[y_buf] - [u_buf]"+r"(u_buf), // %[u_buf] - [v_buf]"+r"(v_buf), // %[v_buf] - [dst_argb]"+r"(dst_argb), // %[dst_argb] - [width]"+rm"(width) // %[width] - : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] - : "memory", "cc", YUVTORGB_REGS "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" - ); -} - -#ifdef HAS_I210ALPHATOARGBROW_SSSE3 -// 10 bit YUVA to ARGB -void OMITFP I210AlphaToARGBRow_SSSE3(const uint16_t* y_buf, - const uint16_t* u_buf, - const uint16_t* v_buf, - const uint16_t* a_buf, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { - asm volatile( - YUVTORGB_SETUP( - yuvconstants) "sub %[u_buf],%[v_buf] \n" - - LABELALIGN "1: \n" READYUVA210 - YUVTORGB(yuvconstants) STOREARGB - "subl $0x8,%[width] \n" - "jg 1b \n" - : [y_buf] "+r"(y_buf), // %[y_buf] - [u_buf] "+r"(u_buf), // %[u_buf] - [v_buf] "+r"(v_buf), // %[v_buf] - [a_buf] "+r"(a_buf), - [dst_argb] "+r"(dst_argb), // %[dst_argb] -#if defined(__i386__) - [width] "+m"(width) // %[width] -#else - [width] "+rm"(width) // %[width] -#endif - : [yuvconstants] "r"(yuvconstants) // %[yuvconstants] - : "memory", "cc", YUVTORGB_REGS "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", - "xmm5"); -} -#endif - -#ifdef HAS_I410ALPHATOARGBROW_SSSE3 -// 10 bit YUVA to ARGB -void OMITFP I410AlphaToARGBRow_SSSE3(const uint16_t* y_buf, - const uint16_t* u_buf, - const uint16_t* v_buf, - 
const uint16_t* a_buf, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { - // clang-format off - asm volatile( - YUVTORGB_SETUP(yuvconstants) - "sub %[u_buf],%[v_buf] \n" - - LABELALIGN - "1: \n" - READYUVA410 - YUVTORGB(yuvconstants) - STOREARGB - "subl $0x8,%[width] \n" - "jg 1b \n" - : [y_buf] "+r"(y_buf), // %[y_buf] - [u_buf] "+r"(u_buf), // %[u_buf] - [v_buf] "+r"(v_buf), // %[v_buf] - [a_buf] "+r"(a_buf), - [dst_argb] "+r"(dst_argb), // %[dst_argb] -#if defined(__i386__) - [width] "+m"(width) // %[width] -#else - [width] "+rm"(width) // %[width] -#endif - : [yuvconstants] "r"(yuvconstants) // %[yuvconstants] - : "memory", "cc", YUVTORGB_REGS "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", - "xmm5"); - // clang-format on -} -#endif - -// 10 bit YUV to AR30 -void OMITFP I410ToAR30Row_SSSE3(const uint16_t* y_buf, - const uint16_t* u_buf, - const uint16_t* v_buf, - uint8_t* dst_ar30, - const struct YuvConstants* yuvconstants, - int width) { - asm volatile ( - YUVTORGB_SETUP(yuvconstants) - "sub %[u_buf],%[v_buf] \n" - "pcmpeqb %%xmm5,%%xmm5 \n" - "psrlw $14,%%xmm5 \n" - "psllw $4,%%xmm5 \n" // 2 alpha bits - "pxor %%xmm6,%%xmm6 \n" // 0 for min - "pcmpeqb %%xmm7,%%xmm7 \n" - "psrlw $6,%%xmm7 \n" // 1023 for max - - LABELALIGN - "1: \n" - READYUV410 - YUVTORGB16(yuvconstants) - STOREAR30 - "sub $0x8,%[width] \n" - "jg 1b \n" - : [y_buf]"+r"(y_buf), // %[y_buf] - [u_buf]"+r"(u_buf), // %[u_buf] - [v_buf]"+r"(v_buf), // %[v_buf] - [dst_ar30]"+r"(dst_ar30), // %[dst_ar30] - [width]"+rm"(width) // %[width] - : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] - : "memory", "cc", YUVTORGB_REGS - "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" - ); -} - -#ifdef HAS_I422ALPHATOARGBROW_SSSE3 -void OMITFP I422AlphaToARGBRow_SSSE3(const uint8_t* y_buf, - const uint8_t* u_buf, - const uint8_t* v_buf, - const uint8_t* a_buf, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { - // clang-format off - asm volatile ( - YUVTORGB_SETUP(yuvconstants) - "sub %[u_buf],%[v_buf] \n" - - LABELALIGN - "1: \n" - READYUVA422 - YUVTORGB(yuvconstants) - STOREARGB - "subl $0x8,%[width] \n" - "jg 1b \n" - : [y_buf]"+r"(y_buf), // %[y_buf] - [u_buf]"+r"(u_buf), // %[u_buf] - [v_buf]"+r"(v_buf), // %[v_buf] - [a_buf]"+r"(a_buf), // %[a_buf] - [dst_argb]"+r"(dst_argb), // %[dst_argb] -#if defined(__i386__) - [width]"+m"(width) // %[width] -#else - [width]"+rm"(width) // %[width] -#endif - : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] - : "memory", "cc", YUVTORGB_REGS - "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" - ); - // clang-format on -} -#endif // HAS_I422ALPHATOARGBROW_SSSE3 - -void OMITFP NV12ToARGBRow_SSSE3(const uint8_t* y_buf, - const uint8_t* uv_buf, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { - // clang-format off - asm volatile ( - YUVTORGB_SETUP(yuvconstants) - "pcmpeqb %%xmm5,%%xmm5 \n" - - LABELALIGN - "1: \n" - READNV12 - YUVTORGB(yuvconstants) - STOREARGB - "sub $0x8,%[width] \n" - "jg 1b \n" - : [y_buf]"+r"(y_buf), // %[y_buf] - [uv_buf]"+r"(uv_buf), // %[uv_buf] - [dst_argb]"+r"(dst_argb), // %[dst_argb] - [width]"+rm"(width) // %[width] - : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] - : "memory", "cc", YUVTORGB_REGS - "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" - ); - // clang-format on -} - -void OMITFP NV21ToARGBRow_SSSE3(const uint8_t* y_buf, - const uint8_t* vu_buf, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { - // clang-format off - asm 
volatile ( - YUVTORGB_SETUP(yuvconstants) - "pcmpeqb %%xmm5,%%xmm5 \n" - - LABELALIGN - "1: \n" - READNV21 - YUVTORGB(yuvconstants) - STOREARGB - "sub $0x8,%[width] \n" - "jg 1b \n" - : [y_buf]"+r"(y_buf), // %[y_buf] - [vu_buf]"+r"(vu_buf), // %[vu_buf] - [dst_argb]"+r"(dst_argb), // %[dst_argb] - [width]"+rm"(width) // %[width] - : [yuvconstants]"r"(yuvconstants), // %[yuvconstants] - [kShuffleNV21]"m"(kShuffleNV21) - : "memory", "cc", YUVTORGB_REGS - "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" - ); - // clang-format on -} - -void OMITFP YUY2ToARGBRow_SSSE3(const uint8_t* yuy2_buf, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { - // clang-format off - asm volatile ( - YUVTORGB_SETUP(yuvconstants) - "pcmpeqb %%xmm5,%%xmm5 \n" - - LABELALIGN - "1: \n" - READYUY2 - YUVTORGB(yuvconstants) - STOREARGB - "sub $0x8,%[width] \n" - "jg 1b \n" - : [yuy2_buf]"+r"(yuy2_buf), // %[yuy2_buf] - [dst_argb]"+r"(dst_argb), // %[dst_argb] - [width]"+rm"(width) // %[width] - : [yuvconstants]"r"(yuvconstants), // %[yuvconstants] - [kShuffleYUY2Y]"m"(kShuffleYUY2Y), - [kShuffleYUY2UV]"m"(kShuffleYUY2UV) - : "memory", "cc", YUVTORGB_REGS - "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" - ); - // clang-format on -} - -void OMITFP UYVYToARGBRow_SSSE3(const uint8_t* uyvy_buf, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { - // clang-format off - asm volatile ( - YUVTORGB_SETUP(yuvconstants) - "pcmpeqb %%xmm5,%%xmm5 \n" - - LABELALIGN - "1: \n" - READUYVY - YUVTORGB(yuvconstants) - STOREARGB - "sub $0x8,%[width] \n" - "jg 1b \n" - : [uyvy_buf]"+r"(uyvy_buf), // %[uyvy_buf] - [dst_argb]"+r"(dst_argb), // %[dst_argb] - [width]"+rm"(width) // %[width] - : [yuvconstants]"r"(yuvconstants), // %[yuvconstants] - [kShuffleUYVYY]"m"(kShuffleUYVYY), - [kShuffleUYVYUV]"m"(kShuffleUYVYUV) - : "memory", "cc", YUVTORGB_REGS - "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" - ); - // clang-format on -} - -void OMITFP P210ToARGBRow_SSSE3(const uint16_t* y_buf, - const uint16_t* uv_buf, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { - asm volatile( - YUVTORGB_SETUP( - yuvconstants) "pcmpeqb %%xmm5,%%xmm5 \n" - - LABELALIGN "1: \n" READP210 - YUVTORGB(yuvconstants) STOREARGB - "sub $0x8,%[width] \n" - "jg 1b \n" - : [y_buf] "+r"(y_buf), // %[y_buf] - [uv_buf] "+r"(uv_buf), // %[u_buf] - [dst_argb] "+r"(dst_argb), // %[dst_argb] - [width] "+rm"(width) // %[width] - : [yuvconstants] "r"(yuvconstants) // %[yuvconstants] - : "memory", "cc", YUVTORGB_REGS "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", - "xmm5"); -} - -void OMITFP P410ToARGBRow_SSSE3(const uint16_t* y_buf, - const uint16_t* uv_buf, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { - asm volatile( - YUVTORGB_SETUP( - yuvconstants) "pcmpeqb %%xmm5,%%xmm5 \n" - - LABELALIGN "1: \n" READP410 - YUVTORGB(yuvconstants) STOREARGB - "sub $0x8,%[width] \n" - "jg 1b \n" - : [y_buf] "+r"(y_buf), // %[y_buf] - [uv_buf] "+r"(uv_buf), // %[u_buf] - [dst_argb] "+r"(dst_argb), // %[dst_argb] - [width] "+rm"(width) // %[width] - : [yuvconstants] "r"(yuvconstants) // %[yuvconstants] - : "memory", "cc", YUVTORGB_REGS "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", - "xmm5"); -} - -void OMITFP P210ToAR30Row_SSSE3(const uint16_t* y_buf, - const uint16_t* uv_buf, - uint8_t* dst_ar30, - const struct YuvConstants* yuvconstants, - int width) { - asm volatile ( - YUVTORGB_SETUP(yuvconstants) - "pcmpeqb %%xmm5,%%xmm5 \n" - "psrlw $14,%%xmm5 \n" - "psllw $4,%%xmm5 \n" // 2 alpha bits - 
"pxor %%xmm6,%%xmm6 \n" // 0 for min - "pcmpeqb %%xmm7,%%xmm7 \n" - "psrlw $6,%%xmm7 \n" // 1023 for max - - LABELALIGN - "1: \n" - READP210 - YUVTORGB16(yuvconstants) - STOREAR30 - "sub $0x8,%[width] \n" - "jg 1b \n" - : [y_buf]"+r"(y_buf), // %[y_buf] - [uv_buf]"+r"(uv_buf), // %[uv_buf] - [dst_ar30]"+r"(dst_ar30), // %[dst_ar30] - [width]"+rm"(width) // %[width] - : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] - : "memory", "cc", YUVTORGB_REGS - "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" - ); -} - -void OMITFP P410ToAR30Row_SSSE3(const uint16_t* y_buf, - const uint16_t* uv_buf, - uint8_t* dst_ar30, - const struct YuvConstants* yuvconstants, - int width) { - asm volatile ( - YUVTORGB_SETUP(yuvconstants) - "pcmpeqb %%xmm5,%%xmm5 \n" - "psrlw $14,%%xmm5 \n" - "psllw $4,%%xmm5 \n" // 2 alpha bits - "pxor %%xmm6,%%xmm6 \n" // 0 for min - "pcmpeqb %%xmm7,%%xmm7 \n" - "psrlw $6,%%xmm7 \n" // 1023 for max - - LABELALIGN - "1: \n" - READP410 - YUVTORGB16(yuvconstants) - STOREAR30 - "sub $0x8,%[width] \n" - "jg 1b \n" - : [y_buf]"+r"(y_buf), // %[y_buf] - [uv_buf]"+r"(uv_buf), // %[uv_buf] - [dst_ar30]"+r"(dst_ar30), // %[dst_ar30] - [width]"+rm"(width) // %[width] - : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] - : "memory", "cc", YUVTORGB_REGS - "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" - ); -} - -void OMITFP I422ToRGBARow_SSSE3(const uint8_t* y_buf, - const uint8_t* u_buf, - const uint8_t* v_buf, - uint8_t* dst_rgba, - const struct YuvConstants* yuvconstants, - int width) { - asm volatile ( - YUVTORGB_SETUP(yuvconstants) - "sub %[u_buf],%[v_buf] \n" - "pcmpeqb %%xmm5,%%xmm5 \n" - - LABELALIGN - "1: \n" - READYUV422 - YUVTORGB(yuvconstants) - STORERGBA - "sub $0x8,%[width] \n" - "jg 1b \n" - : [y_buf]"+r"(y_buf), // %[y_buf] - [u_buf]"+r"(u_buf), // %[u_buf] - [v_buf]"+r"(v_buf), // %[v_buf] - [dst_rgba]"+r"(dst_rgba), // %[dst_rgba] - [width]"+rm"(width) // %[width] - : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] - : "memory", "cc", YUVTORGB_REGS - "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" - ); -} - -#endif // HAS_I422TOARGBROW_SSSE3 - -// Read 16 UV from 444 -#define READYUV444_AVX2 \ - "vmovdqu (%[u_buf]),%%xmm3 \n" \ - "vmovdqu 0x00(%[u_buf],%[v_buf],1),%%xmm1 \n" \ - "lea 0x10(%[u_buf]),%[u_buf] \n" \ - "vpermq $0xd8,%%ymm3,%%ymm3 \n" \ - "vpermq $0xd8,%%ymm1,%%ymm1 \n" \ - "vpunpcklbw %%ymm1,%%ymm3,%%ymm3 \n" \ - "vmovdqu (%[y_buf]),%%xmm4 \n" \ - "vpermq $0xd8,%%ymm4,%%ymm4 \n" \ - "vpunpcklbw %%ymm4,%%ymm4,%%ymm4 \n" \ - "lea 0x10(%[y_buf]),%[y_buf] \n" - -// Read 8 UV from 422, upsample to 16 UV. -#define READYUV422_AVX2 \ - "vmovq (%[u_buf]),%%xmm3 \n" \ - "vmovq 0x00(%[u_buf],%[v_buf],1),%%xmm1 \n" \ - "lea 0x8(%[u_buf]),%[u_buf] \n" \ - "vpunpcklbw %%ymm1,%%ymm3,%%ymm3 \n" \ - "vpermq $0xd8,%%ymm3,%%ymm3 \n" \ - "vpunpcklwd %%ymm3,%%ymm3,%%ymm3 \n" \ - "vmovdqu (%[y_buf]),%%xmm4 \n" \ - "vpermq $0xd8,%%ymm4,%%ymm4 \n" \ - "vpunpcklbw %%ymm4,%%ymm4,%%ymm4 \n" \ - "lea 0x10(%[y_buf]),%[y_buf] \n" - -// Read 8 UV from 210, upsample to 16 UV -// TODO(fbarchard): Consider vshufb to replace pack/unpack -// TODO(fbarchard): Consider vunpcklpd to combine the 2 registers into 1. 
-#define READYUV210_AVX2 \ - "vmovdqu (%[u_buf]),%%xmm3 \n" \ - "vmovdqu 0x00(%[u_buf],%[v_buf],1),%%xmm1 \n" \ - "lea 0x10(%[u_buf]),%[u_buf] \n" \ - "vpermq $0xd8,%%ymm3,%%ymm3 \n" \ - "vpermq $0xd8,%%ymm1,%%ymm1 \n" \ - "vpunpcklwd %%ymm1,%%ymm3,%%ymm3 \n" \ - "vpsraw $2,%%ymm3,%%ymm3 \n" \ - "vpackuswb %%ymm3,%%ymm3,%%ymm3 \n" \ - "vpunpcklwd %%ymm3,%%ymm3,%%ymm3 \n" \ - "vmovdqu (%[y_buf]),%%ymm4 \n" \ - "vpsllw $6,%%ymm4,%%ymm4 \n" \ - "lea 0x20(%[y_buf]),%[y_buf] \n" - -// Read 8 UV from 210, upsample to 16 UV. With 16 Alpha. -#define READYUVA210_AVX2 \ - "vmovdqu (%[u_buf]),%%xmm3 \n" \ - "vmovdqu 0x00(%[u_buf],%[v_buf],1),%%xmm1 \n" \ - "lea 0x10(%[u_buf]),%[u_buf] \n" \ - "vpermq $0xd8,%%ymm3,%%ymm3 \n" \ - "vpermq $0xd8,%%ymm1,%%ymm1 \n" \ - "vpunpcklwd %%ymm1,%%ymm3,%%ymm3 \n" \ - "vpsraw $2,%%ymm3,%%ymm3 \n" \ - "vpackuswb %%ymm3,%%ymm3,%%ymm3 \n" \ - "vpunpcklwd %%ymm3,%%ymm3,%%ymm3 \n" \ - "vmovdqu (%[y_buf]),%%ymm4 \n" \ - "vpsllw $6,%%ymm4,%%ymm4 \n" \ - "lea 0x20(%[y_buf]),%[y_buf] \n" \ - "vmovdqu (%[a_buf]),%%ymm5 \n" \ - "vpsraw $2,%%ymm5,%%ymm5 \n" \ - "vpackuswb %%ymm5,%%ymm5,%%ymm5 \n" \ - "lea 0x20(%[a_buf]),%[a_buf] \n" - -// Read 16 UV from 410 -#define READYUV410_AVX2 \ - "vmovdqu (%[u_buf]),%%ymm3 \n" \ - "vmovdqu 0x00(%[u_buf],%[v_buf],1),%%ymm2 \n" \ - "lea 0x20(%[u_buf]),%[u_buf] \n" \ - "vpsraw $2,%%ymm3,%%ymm3 \n" \ - "vpsraw $2,%%ymm2,%%ymm2 \n" \ - "vpunpckhwd %%ymm2,%%ymm3,%%ymm1 \n" \ - "vpunpcklwd %%ymm2,%%ymm3,%%ymm3 \n" \ - "vpackuswb %%ymm1,%%ymm3,%%ymm3 \n" \ - "vmovdqu (%[y_buf]),%%ymm4 \n" \ - "vpsllw $6,%%ymm4,%%ymm4 \n" \ - "lea 0x20(%[y_buf]),%[y_buf] \n" - -// Read 8 UV from 212 12 bit, upsample to 16 UV -#define READYUV212_AVX2 \ - "vmovdqu (%[u_buf]),%%xmm3 \n" \ - "vmovdqu 0x00(%[u_buf],%[v_buf],1),%%xmm1 \n" \ - "lea 0x10(%[u_buf]),%[u_buf] \n" \ - "vpermq $0xd8,%%ymm3,%%ymm3 \n" \ - "vpermq $0xd8,%%ymm1,%%ymm1 \n" \ - "vpunpcklwd %%ymm1,%%ymm3,%%ymm3 \n" \ - "vpsraw $0x4,%%ymm3,%%ymm3 \n" \ - "vpackuswb %%ymm3,%%ymm3,%%ymm3 \n" \ - "vpunpcklwd %%ymm3,%%ymm3,%%ymm3 \n" \ - "vmovdqu (%[y_buf]),%%ymm4 \n" \ - "vpsllw $0x4,%%ymm4,%%ymm4 \n" \ - "lea 0x20(%[y_buf]),%[y_buf] \n" - -// Read 16 UV from 410. With 16 Alpha. -#define READYUVA410_AVX2 \ - "vmovdqu (%[u_buf]),%%ymm3 \n" \ - "vmovdqu 0x00(%[u_buf],%[v_buf],1),%%ymm2 \n" \ - "lea 0x20(%[u_buf]),%[u_buf] \n" \ - "vpsraw $2,%%ymm3,%%ymm3 \n" \ - "vpsraw $2,%%ymm2,%%ymm2 \n" \ - "vpunpckhwd %%ymm2,%%ymm3,%%ymm1 \n" \ - "vpunpcklwd %%ymm2,%%ymm3,%%ymm3 \n" \ - "vpackuswb %%ymm1,%%ymm3,%%ymm3 \n" \ - "vmovdqu (%[y_buf]),%%ymm4 \n" \ - "vpsllw $6,%%ymm4,%%ymm4 \n" \ - "lea 0x20(%[y_buf]),%[y_buf] \n" \ - "vmovdqu (%[a_buf]),%%ymm5 \n" \ - "vpsraw $2,%%ymm5,%%ymm5 \n" \ - "vpackuswb %%ymm5,%%ymm5,%%ymm5 \n" \ - "lea 0x20(%[a_buf]),%[a_buf] \n" - -// Read 16 UV from 444. With 16 Alpha. -#define READYUVA444_AVX2 \ - "vmovdqu (%[u_buf]),%%xmm3 \n" \ - "vmovdqu 0x00(%[u_buf],%[v_buf],1),%%xmm1 \n" \ - "lea 0x10(%[u_buf]),%[u_buf] \n" \ - "vpermq $0xd8,%%ymm3,%%ymm3 \n" \ - "vpermq $0xd8,%%ymm1,%%ymm1 \n" \ - "vpunpcklbw %%ymm1,%%ymm3,%%ymm3 \n" \ - "vmovdqu (%[y_buf]),%%xmm4 \n" \ - "vpermq $0xd8,%%ymm4,%%ymm4 \n" \ - "vpunpcklbw %%ymm4,%%ymm4,%%ymm4 \n" \ - "lea 0x10(%[y_buf]),%[y_buf] \n" \ - "vmovdqu (%[a_buf]),%%xmm5 \n" \ - "vpermq $0xd8,%%ymm5,%%ymm5 \n" \ - "lea 0x10(%[a_buf]),%[a_buf] \n" - -// Read 8 UV from 422, upsample to 16 UV. With 16 Alpha. 
-#define READYUVA422_AVX2 \ - "vmovq (%[u_buf]),%%xmm3 \n" \ - "vmovq 0x00(%[u_buf],%[v_buf],1),%%xmm1 \n" \ - "lea 0x8(%[u_buf]),%[u_buf] \n" \ - "vpunpcklbw %%ymm1,%%ymm3,%%ymm3 \n" \ - "vpermq $0xd8,%%ymm3,%%ymm3 \n" \ - "vpunpcklwd %%ymm3,%%ymm3,%%ymm3 \n" \ - "vmovdqu (%[y_buf]),%%xmm4 \n" \ - "vpermq $0xd8,%%ymm4,%%ymm4 \n" \ - "vpunpcklbw %%ymm4,%%ymm4,%%ymm4 \n" \ - "lea 0x10(%[y_buf]),%[y_buf] \n" \ - "vmovdqu (%[a_buf]),%%xmm5 \n" \ - "vpermq $0xd8,%%ymm5,%%ymm5 \n" \ - "lea 0x10(%[a_buf]),%[a_buf] \n" - -// Read 8 UV from NV12, upsample to 16 UV. -#define READNV12_AVX2 \ - "vmovdqu (%[uv_buf]),%%xmm3 \n" \ - "lea 0x10(%[uv_buf]),%[uv_buf] \n" \ - "vpermq $0xd8,%%ymm3,%%ymm3 \n" \ - "vpunpcklwd %%ymm3,%%ymm3,%%ymm3 \n" \ - "vmovdqu (%[y_buf]),%%xmm4 \n" \ - "vpermq $0xd8,%%ymm4,%%ymm4 \n" \ - "vpunpcklbw %%ymm4,%%ymm4,%%ymm4 \n" \ - "lea 0x10(%[y_buf]),%[y_buf] \n" - -// Read 8 VU from NV21, upsample to 16 UV. -#define READNV21_AVX2 \ - "vmovdqu (%[vu_buf]),%%xmm3 \n" \ - "lea 0x10(%[vu_buf]),%[vu_buf] \n" \ - "vpermq $0xd8,%%ymm3,%%ymm3 \n" \ - "vpshufb %[kShuffleNV21], %%ymm3, %%ymm3 \n" \ - "vmovdqu (%[y_buf]),%%xmm4 \n" \ - "vpermq $0xd8,%%ymm4,%%ymm4 \n" \ - "vpunpcklbw %%ymm4,%%ymm4,%%ymm4 \n" \ - "lea 0x10(%[y_buf]),%[y_buf] \n" - -// Read 4 UV from P210, upsample to 8 UV -#define READP210_AVX2 \ - "vmovdqu (%[uv_buf]),%%ymm3 \n" \ - "lea 0x20(%[uv_buf]),%[uv_buf] \n" \ - "vpsrlw $0x8,%%ymm3,%%ymm3 \n" \ - "vpackuswb %%ymm3,%%ymm3,%%ymm3 \n" \ - "vpunpcklwd %%ymm3,%%ymm3,%%ymm3 \n" \ - "vmovdqu (%[y_buf]),%%ymm4 \n" \ - "lea 0x20(%[y_buf]),%[y_buf] \n" - -// Read 8 UV from P410 -#define READP410_AVX2 \ - "vmovdqu (%[uv_buf]),%%ymm3 \n" \ - "vmovdqu 0x20(%[uv_buf]),%%ymm1 \n" \ - "lea 0x40(%[uv_buf]),%[uv_buf] \n" \ - "vpsrlw $0x8,%%ymm3,%%ymm3 \n" \ - "vpsrlw $0x8,%%ymm1,%%ymm1 \n" \ - "vpackuswb %%ymm1,%%ymm3,%%ymm3 \n" \ - "vpermq $0xd8,%%ymm3,%%ymm3 \n" \ - "vmovdqu (%[y_buf]),%%ymm4 \n" \ - "lea 0x20(%[y_buf]),%[y_buf] \n" - -// Read 8 YUY2 with 16 Y and upsample 8 UV to 16 UV. -#define READYUY2_AVX2 \ - "vmovdqu (%[yuy2_buf]),%%ymm4 \n" \ - "vpshufb %[kShuffleYUY2Y], %%ymm4, %%ymm4 \n" \ - "vmovdqu (%[yuy2_buf]),%%ymm3 \n" \ - "vpshufb %[kShuffleYUY2UV], %%ymm3, %%ymm3 \n" \ - "lea 0x20(%[yuy2_buf]),%[yuy2_buf] \n" - -// Read 8 UYVY with 16 Y and upsample 8 UV to 16 UV. -#define READUYVY_AVX2 \ - "vmovdqu (%[uyvy_buf]),%%ymm4 \n" \ - "vpshufb %[kShuffleUYVYY], %%ymm4, %%ymm4 \n" \ - "vmovdqu (%[uyvy_buf]),%%ymm3 \n" \ - "vpshufb %[kShuffleUYVYUV], %%ymm3, %%ymm3 \n" \ - "lea 0x20(%[uyvy_buf]),%[uyvy_buf] \n" - -#if defined(__x86_64__) -#define YUVTORGB_SETUP_AVX2(yuvconstants) \ - "vpcmpeqb %%xmm13,%%xmm13,%%xmm13 \n" \ - "vmovdqa (%[yuvconstants]),%%ymm8 \n" \ - "vpsllw $7,%%xmm13,%%xmm13 \n" \ - "vmovdqa 32(%[yuvconstants]),%%ymm9 \n" \ - "vpbroadcastb %%xmm13,%%ymm13 \n" \ - "vmovdqa 64(%[yuvconstants]),%%ymm10 \n" \ - "vmovdqa 96(%[yuvconstants]),%%ymm11 \n" \ - "vmovdqa 128(%[yuvconstants]),%%ymm12 \n" - -#define YUVTORGB16_AVX2(yuvconstants) \ - "vpsubb %%ymm13,%%ymm3,%%ymm3 \n" \ - "vpmulhuw %%ymm11,%%ymm4,%%ymm4 \n" \ - "vpmaddubsw %%ymm3,%%ymm8,%%ymm0 \n" \ - "vpmaddubsw %%ymm3,%%ymm9,%%ymm1 \n" \ - "vpmaddubsw %%ymm3,%%ymm10,%%ymm2 \n" \ - "vpaddw %%ymm4,%%ymm12,%%ymm4 \n" \ - "vpaddsw %%ymm4,%%ymm0,%%ymm0 \n" \ - "vpsubsw %%ymm1,%%ymm4,%%ymm1 \n" \ - "vpaddsw %%ymm4,%%ymm2,%%ymm2 \n" - -#define YUVTORGB_REGS_AVX2 \ - "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", - -#else // Convert 16 pixels: 16 UV and 16 Y. 
- -#define YUVTORGB_SETUP_AVX2(yuvconstants) -#define YUVTORGB16_AVX2(yuvconstants) \ - "vpcmpeqb %%xmm0,%%xmm0,%%xmm0 \n" \ - "vpsllw $7,%%xmm0,%%xmm0 \n" \ - "vpbroadcastb %%xmm0,%%ymm0 \n" \ - "vpsubb %%ymm0,%%ymm3,%%ymm3 \n" \ - "vpmulhuw 96(%[yuvconstants]),%%ymm4,%%ymm4 \n" \ - "vmovdqa (%[yuvconstants]),%%ymm0 \n" \ - "vmovdqa 32(%[yuvconstants]),%%ymm1 \n" \ - "vmovdqa 64(%[yuvconstants]),%%ymm2 \n" \ - "vpmaddubsw %%ymm3,%%ymm0,%%ymm0 \n" \ - "vpmaddubsw %%ymm3,%%ymm1,%%ymm1 \n" \ - "vpmaddubsw %%ymm3,%%ymm2,%%ymm2 \n" \ - "vmovdqa 128(%[yuvconstants]),%%ymm3 \n" \ - "vpaddw %%ymm4,%%ymm3,%%ymm4 \n" \ - "vpaddsw %%ymm4,%%ymm0,%%ymm0 \n" \ - "vpsubsw %%ymm1,%%ymm4,%%ymm1 \n" \ - "vpaddsw %%ymm4,%%ymm2,%%ymm2 \n" - -#define YUVTORGB_REGS_AVX2 -#endif - -#define YUVTORGB_AVX2(yuvconstants) \ - YUVTORGB16_AVX2(yuvconstants) \ - "vpsraw $0x6,%%ymm0,%%ymm0 \n" \ - "vpsraw $0x6,%%ymm1,%%ymm1 \n" \ - "vpsraw $0x6,%%ymm2,%%ymm2 \n" \ - "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n" \ - "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n" \ - "vpackuswb %%ymm2,%%ymm2,%%ymm2 \n" - -// Store 16 ARGB values. -#define STOREARGB_AVX2 \ - "vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n" \ - "vpermq $0xd8,%%ymm0,%%ymm0 \n" \ - "vpunpcklbw %%ymm5,%%ymm2,%%ymm2 \n" \ - "vpermq $0xd8,%%ymm2,%%ymm2 \n" \ - "vpunpcklwd %%ymm2,%%ymm0,%%ymm1 \n" \ - "vpunpckhwd %%ymm2,%%ymm0,%%ymm0 \n" \ - "vmovdqu %%ymm1,(%[dst_argb]) \n" \ - "vmovdqu %%ymm0,0x20(%[dst_argb]) \n" \ - "lea 0x40(%[dst_argb]), %[dst_argb] \n" - -// Store 16 AR30 values. -#define STOREAR30_AVX2 \ - "vpsraw $0x4,%%ymm0,%%ymm0 \n" \ - "vpsraw $0x4,%%ymm1,%%ymm1 \n" \ - "vpsraw $0x4,%%ymm2,%%ymm2 \n" \ - "vpminsw %%ymm7,%%ymm0,%%ymm0 \n" \ - "vpminsw %%ymm7,%%ymm1,%%ymm1 \n" \ - "vpminsw %%ymm7,%%ymm2,%%ymm2 \n" \ - "vpmaxsw %%ymm6,%%ymm0,%%ymm0 \n" \ - "vpmaxsw %%ymm6,%%ymm1,%%ymm1 \n" \ - "vpmaxsw %%ymm6,%%ymm2,%%ymm2 \n" \ - "vpsllw $0x4,%%ymm2,%%ymm2 \n" \ - "vpermq $0xd8,%%ymm0,%%ymm0 \n" \ - "vpermq $0xd8,%%ymm1,%%ymm1 \n" \ - "vpermq $0xd8,%%ymm2,%%ymm2 \n" \ - "vpunpckhwd %%ymm2,%%ymm0,%%ymm3 \n" \ - "vpunpcklwd %%ymm2,%%ymm0,%%ymm0 \n" \ - "vpunpckhwd %%ymm5,%%ymm1,%%ymm2 \n" \ - "vpunpcklwd %%ymm5,%%ymm1,%%ymm1 \n" \ - "vpslld $0xa,%%ymm1,%%ymm1 \n" \ - "vpslld $0xa,%%ymm2,%%ymm2 \n" \ - "vpor %%ymm1,%%ymm0,%%ymm0 \n" \ - "vpor %%ymm2,%%ymm3,%%ymm3 \n" \ - "vmovdqu %%ymm0,(%[dst_ar30]) \n" \ - "vmovdqu %%ymm3,0x20(%[dst_ar30]) \n" \ - "lea 0x40(%[dst_ar30]), %[dst_ar30] \n" - -#ifdef HAS_I444TOARGBROW_AVX2 -// 16 pixels -// 16 UV values with 16 Y producing 16 ARGB (64 bytes). -void OMITFP I444ToARGBRow_AVX2(const uint8_t* y_buf, - const uint8_t* u_buf, - const uint8_t* v_buf, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { - asm volatile ( - YUVTORGB_SETUP_AVX2(yuvconstants) - "sub %[u_buf],%[v_buf] \n" - "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" - - LABELALIGN - "1: \n" - READYUV444_AVX2 - YUVTORGB_AVX2(yuvconstants) - STOREARGB_AVX2 - "sub $0x10,%[width] \n" - "jg 1b \n" - "vzeroupper \n" - : [y_buf]"+r"(y_buf), // %[y_buf] - [u_buf]"+r"(u_buf), // %[u_buf] - [v_buf]"+r"(v_buf), // %[v_buf] - [dst_argb]"+r"(dst_argb), // %[dst_argb] - [width]"+rm"(width) // %[width] - : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] - : "memory", "cc", YUVTORGB_REGS_AVX2 - "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" - ); -} -#endif // HAS_I444TOARGBROW_AVX2 - -#if defined(HAS_I422TOARGBROW_AVX2) -// 16 pixels -// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes). 
-void OMITFP I422ToARGBRow_AVX2(const uint8_t* y_buf, - const uint8_t* u_buf, - const uint8_t* v_buf, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { - asm volatile ( - YUVTORGB_SETUP_AVX2(yuvconstants) - "sub %[u_buf],%[v_buf] \n" - "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" - - LABELALIGN - "1: \n" - READYUV422_AVX2 - YUVTORGB_AVX2(yuvconstants) - STOREARGB_AVX2 - "sub $0x10,%[width] \n" - "jg 1b \n" - - "vzeroupper \n" - : [y_buf]"+r"(y_buf), // %[y_buf] - [u_buf]"+r"(u_buf), // %[u_buf] - [v_buf]"+r"(v_buf), // %[v_buf] - [dst_argb]"+r"(dst_argb), // %[dst_argb] - [width]"+rm"(width) // %[width] - : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] - : "memory", "cc", YUVTORGB_REGS_AVX2 - "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" - ); -} -#endif // HAS_I422TOARGBROW_AVX2 - -#if defined(HAS_I422TOAR30ROW_AVX2) -// 16 pixels -// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 AR30 (64 bytes). -void OMITFP I422ToAR30Row_AVX2(const uint8_t* y_buf, - const uint8_t* u_buf, - const uint8_t* v_buf, - uint8_t* dst_ar30, - const struct YuvConstants* yuvconstants, - int width) { - asm volatile ( - YUVTORGB_SETUP_AVX2(yuvconstants) - "sub %[u_buf],%[v_buf] \n" - "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" // AR30 constants - "vpsrlw $14,%%ymm5,%%ymm5 \n" - "vpsllw $4,%%ymm5,%%ymm5 \n" // 2 alpha bits - "vpxor %%ymm6,%%ymm6,%%ymm6 \n" // 0 for min - "vpcmpeqb %%ymm7,%%ymm7,%%ymm7 \n" // 1023 for max - "vpsrlw $6,%%ymm7,%%ymm7 \n" - - LABELALIGN - "1: \n" - READYUV422_AVX2 - YUVTORGB16_AVX2(yuvconstants) - STOREAR30_AVX2 - "sub $0x10,%[width] \n" - "jg 1b \n" - - "vzeroupper \n" - : [y_buf]"+r"(y_buf), // %[y_buf] - [u_buf]"+r"(u_buf), // %[u_buf] - [v_buf]"+r"(v_buf), // %[v_buf] - [dst_ar30]"+r"(dst_ar30), // %[dst_ar30] - [width]"+rm"(width) // %[width] - : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] - : "memory", "cc", YUVTORGB_REGS_AVX2 - "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" - ); -} -#endif // HAS_I422TOAR30ROW_AVX2 - -#if defined(HAS_I210TOARGBROW_AVX2) -// 16 pixels -// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes). -void OMITFP I210ToARGBRow_AVX2(const uint16_t* y_buf, - const uint16_t* u_buf, - const uint16_t* v_buf, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { - asm volatile ( - YUVTORGB_SETUP_AVX2(yuvconstants) - "sub %[u_buf],%[v_buf] \n" - "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" - - LABELALIGN - "1: \n" - READYUV210_AVX2 - YUVTORGB_AVX2(yuvconstants) - STOREARGB_AVX2 - "sub $0x10,%[width] \n" - "jg 1b \n" - - "vzeroupper \n" - : [y_buf]"+r"(y_buf), // %[y_buf] - [u_buf]"+r"(u_buf), // %[u_buf] - [v_buf]"+r"(v_buf), // %[v_buf] - [dst_argb]"+r"(dst_argb), // %[dst_argb] - [width]"+rm"(width) // %[width] - : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] - : "memory", "cc", YUVTORGB_REGS_AVX2 - "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" - ); -} -#endif // HAS_I210TOARGBROW_AVX2 - -#if defined(HAS_I212TOARGBROW_AVX2) -// 16 pixels -// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes). 
-void OMITFP I212ToARGBRow_AVX2(const uint16_t* y_buf, - const uint16_t* u_buf, - const uint16_t* v_buf, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { - asm volatile ( - YUVTORGB_SETUP_AVX2(yuvconstants) - "sub %[u_buf],%[v_buf] \n" - "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" - - LABELALIGN - "1: \n" - READYUV212_AVX2 - YUVTORGB_AVX2(yuvconstants) - STOREARGB_AVX2 - "sub $0x10,%[width] \n" - "jg 1b \n" - - "vzeroupper \n" - : [y_buf]"+r"(y_buf), // %[y_buf] - [u_buf]"+r"(u_buf), // %[u_buf] - [v_buf]"+r"(v_buf), // %[v_buf] - [dst_argb]"+r"(dst_argb), // %[dst_argb] - [width]"+rm"(width) // %[width] - : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] - : "memory", "cc", YUVTORGB_REGS_AVX2 - "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" - ); -} -#endif // HAS_I212TOARGBROW_AVX2 - -#if defined(HAS_I210TOAR30ROW_AVX2) -// 16 pixels -// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 AR30 (64 bytes). -void OMITFP I210ToAR30Row_AVX2(const uint16_t* y_buf, - const uint16_t* u_buf, - const uint16_t* v_buf, - uint8_t* dst_ar30, - const struct YuvConstants* yuvconstants, - int width) { - asm volatile ( - YUVTORGB_SETUP_AVX2(yuvconstants) - "sub %[u_buf],%[v_buf] \n" - "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" // AR30 constants - "vpsrlw $14,%%ymm5,%%ymm5 \n" - "vpsllw $4,%%ymm5,%%ymm5 \n" // 2 alpha bits - "vpxor %%ymm6,%%ymm6,%%ymm6 \n" // 0 for min - "vpcmpeqb %%ymm7,%%ymm7,%%ymm7 \n" // 1023 for max - "vpsrlw $6,%%ymm7,%%ymm7 \n" - - LABELALIGN - "1: \n" - READYUV210_AVX2 - YUVTORGB16_AVX2(yuvconstants) - STOREAR30_AVX2 - "sub $0x10,%[width] \n" - "jg 1b \n" - - "vzeroupper \n" - : [y_buf]"+r"(y_buf), // %[y_buf] - [u_buf]"+r"(u_buf), // %[u_buf] - [v_buf]"+r"(v_buf), // %[v_buf] - [dst_ar30]"+r"(dst_ar30), // %[dst_ar30] - [width]"+rm"(width) // %[width] - : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] - : "memory", "cc", YUVTORGB_REGS_AVX2 - "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" - ); -} -#endif // HAS_I210TOAR30ROW_AVX2 - -#if defined(HAS_I212TOAR30ROW_AVX2) -// 16 pixels -// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 AR30 (64 bytes). -void OMITFP I212ToAR30Row_AVX2(const uint16_t* y_buf, - const uint16_t* u_buf, - const uint16_t* v_buf, - uint8_t* dst_ar30, - const struct YuvConstants* yuvconstants, - int width) { - asm volatile ( - YUVTORGB_SETUP_AVX2(yuvconstants) - "sub %[u_buf],%[v_buf] \n" - "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" // AR30 constants - "vpsrlw $14,%%ymm5,%%ymm5 \n" - "vpsllw $4,%%ymm5,%%ymm5 \n" // 2 alpha bits - "vpxor %%ymm6,%%ymm6,%%ymm6 \n" // 0 for min - "vpcmpeqb %%ymm7,%%ymm7,%%ymm7 \n" // 1023 for max - "vpsrlw $6,%%ymm7,%%ymm7 \n" - - LABELALIGN - "1: \n" - READYUV212_AVX2 - YUVTORGB16_AVX2(yuvconstants) - STOREAR30_AVX2 - "sub $0x10,%[width] \n" - "jg 1b \n" - - "vzeroupper \n" - : [y_buf]"+r"(y_buf), // %[y_buf] - [u_buf]"+r"(u_buf), // %[u_buf] - [v_buf]"+r"(v_buf), // %[v_buf] - [dst_ar30]"+r"(dst_ar30), // %[dst_ar30] - [width]"+rm"(width) // %[width] - : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] - : "memory", "cc", YUVTORGB_REGS_AVX2 - "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" - ); -} -#endif // HAS_I212TOAR30ROW_AVX2 - -#if defined(HAS_I410TOARGBROW_AVX2) -// 16 pixels -// 16 UV values with 16 Y producing 16 ARGB (64 bytes). 
-void OMITFP I410ToARGBRow_AVX2(const uint16_t* y_buf, - const uint16_t* u_buf, - const uint16_t* v_buf, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { - asm volatile ( - YUVTORGB_SETUP_AVX2(yuvconstants) - "sub %[u_buf],%[v_buf] \n" - "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" - - LABELALIGN - "1: \n" - READYUV410_AVX2 - YUVTORGB_AVX2(yuvconstants) - STOREARGB_AVX2 - "sub $0x10,%[width] \n" - "jg 1b \n" - "vzeroupper \n" - - : [y_buf]"+r"(y_buf), // %[y_buf] - [u_buf]"+r"(u_buf), // %[u_buf] - [v_buf]"+r"(v_buf), // %[v_buf] - [dst_argb]"+r"(dst_argb), // %[dst_argb] - [width]"+rm"(width) // %[width] - : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] - : "memory", "cc", YUVTORGB_REGS_AVX2 - "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" - ); -} -#endif // HAS_I410TOARGBROW_AVX2 - -#if defined(HAS_I210ALPHATOARGBROW_AVX2) -// 16 pixels -// 8 UV, 16 Y and 16 A producing 16 ARGB (64 bytes). -void OMITFP I210AlphaToARGBRow_AVX2(const uint16_t* y_buf, - const uint16_t* u_buf, - const uint16_t* v_buf, - const uint16_t* a_buf, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { - asm volatile( - YUVTORGB_SETUP_AVX2( - yuvconstants) "sub %[u_buf],%[v_buf] \n" - - LABELALIGN "1: \n" READYUVA210_AVX2 - YUVTORGB_AVX2(yuvconstants) STOREARGB_AVX2 - "subl $0x10,%[width] \n" - "jg 1b \n" - "vzeroupper \n" - - : [y_buf] "+r"(y_buf), // %[y_buf] - [u_buf] "+r"(u_buf), // %[u_buf] - [v_buf] "+r"(v_buf), // %[v_buf] - [a_buf] "+r"(a_buf), // %[a_buf] - [dst_argb] "+r"(dst_argb), // %[dst_argb] -#if defined(__i386__) - [width] "+m"(width) // %[width] -#else - [width] "+rm"(width) // %[width] -#endif - : [yuvconstants] "r"(yuvconstants) // %[yuvconstants] - : "memory", "cc", YUVTORGB_REGS_AVX2 "xmm0", "xmm1", "xmm2", "xmm3", - "xmm4", "xmm5"); -} -#endif // HAS_I210TOARGBROW_AVX2 - -#if defined(HAS_I410ALPHATOARGBROW_AVX2) -// 16 pixels -// 16 UV, 16 Y and 16 A producing 16 ARGB (64 bytes). -void OMITFP I410AlphaToARGBRow_AVX2(const uint16_t* y_buf, - const uint16_t* u_buf, - const uint16_t* v_buf, - const uint16_t* a_buf, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { - asm volatile( - YUVTORGB_SETUP_AVX2( - yuvconstants) "sub %[u_buf],%[v_buf] \n" - - LABELALIGN "1: \n" READYUVA410_AVX2 - YUVTORGB_AVX2(yuvconstants) STOREARGB_AVX2 - "subl $0x10,%[width] \n" - "jg 1b \n" - "vzeroupper \n" - - : [y_buf] "+r"(y_buf), // %[y_buf] - [u_buf] "+r"(u_buf), // %[u_buf] - [v_buf] "+r"(v_buf), // %[v_buf] - [a_buf] "+r"(a_buf), // %[a_buf] - [dst_argb] "+r"(dst_argb), // %[dst_argb] -#if defined(__i386__) - [width] "+m"(width) // %[width] -#else - [width] "+rm"(width) // %[width] -#endif - : [yuvconstants] "r"(yuvconstants) // %[yuvconstants] - : "memory", "cc", YUVTORGB_REGS_AVX2 "xmm0", "xmm1", "xmm2", "xmm3", - "xmm4", "xmm5"); -} -#endif // HAS_I410TOARGBROW_AVX2 - -#if defined(HAS_I410TOAR30ROW_AVX2) -// 16 pixels -// 16 UV values with 16 Y producing 16 AR30 (64 bytes). 
-void OMITFP I410ToAR30Row_AVX2(const uint16_t* y_buf, - const uint16_t* u_buf, - const uint16_t* v_buf, - uint8_t* dst_ar30, - const struct YuvConstants* yuvconstants, - int width) { - asm volatile ( - YUVTORGB_SETUP_AVX2(yuvconstants) - "sub %[u_buf],%[v_buf] \n" - "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" // AR30 constants - "vpsrlw $14,%%ymm5,%%ymm5 \n" - "vpsllw $4,%%ymm5,%%ymm5 \n" // 2 alpha bits - "vpxor %%ymm6,%%ymm6,%%ymm6 \n" // 0 for min - "vpcmpeqb %%ymm7,%%ymm7,%%ymm7 \n" // 1023 for max - "vpsrlw $6,%%ymm7,%%ymm7 \n" - - LABELALIGN - "1: \n" - READYUV410_AVX2 - YUVTORGB16_AVX2(yuvconstants) - STOREAR30_AVX2 - "sub $0x10,%[width] \n" - "jg 1b \n" - - "vzeroupper \n" - : [y_buf]"+r"(y_buf), // %[y_buf] - [u_buf]"+r"(u_buf), // %[u_buf] - [v_buf]"+r"(v_buf), // %[v_buf] - [dst_ar30]"+r"(dst_ar30), // %[dst_ar30] - [width]"+rm"(width) // %[width] - : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] - : "memory", "cc", YUVTORGB_REGS_AVX2 - "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" - ); -} -#endif // HAS_I410TOAR30ROW_AVX2 - -#if defined(HAS_I444ALPHATOARGBROW_AVX2) -// 16 pixels -// 16 UV values with 16 Y and 16 A producing 16 ARGB. -void OMITFP I444AlphaToARGBRow_AVX2(const uint8_t* y_buf, - const uint8_t* u_buf, - const uint8_t* v_buf, - const uint8_t* a_buf, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { - // clang-format off - asm volatile ( - YUVTORGB_SETUP_AVX2(yuvconstants) - "sub %[u_buf],%[v_buf] \n" - - LABELALIGN - "1: \n" - READYUVA444_AVX2 - YUVTORGB_AVX2(yuvconstants) - STOREARGB_AVX2 - "subl $0x10,%[width] \n" - "jg 1b \n" - "vzeroupper \n" - : [y_buf]"+r"(y_buf), // %[y_buf] - [u_buf]"+r"(u_buf), // %[u_buf] - [v_buf]"+r"(v_buf), // %[v_buf] - [a_buf]"+r"(a_buf), // %[a_buf] - [dst_argb]"+r"(dst_argb), // %[dst_argb] -#if defined(__i386__) - [width]"+m"(width) // %[width] -#else - [width]"+rm"(width) // %[width] -#endif - : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] - : "memory", "cc", YUVTORGB_REGS_AVX2 - "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" - ); - // clang-format on -} -#endif // HAS_I444ALPHATOARGBROW_AVX2 - -#if defined(HAS_I422ALPHATOARGBROW_AVX2) -// 16 pixels -// 8 UV values upsampled to 16 UV, mixed with 16 Y and 16 A producing 16 ARGB. -void OMITFP I422AlphaToARGBRow_AVX2(const uint8_t* y_buf, - const uint8_t* u_buf, - const uint8_t* v_buf, - const uint8_t* a_buf, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { - // clang-format off - asm volatile ( - YUVTORGB_SETUP_AVX2(yuvconstants) - "sub %[u_buf],%[v_buf] \n" - - LABELALIGN - "1: \n" - READYUVA422_AVX2 - YUVTORGB_AVX2(yuvconstants) - STOREARGB_AVX2 - "subl $0x10,%[width] \n" - "jg 1b \n" - "vzeroupper \n" - : [y_buf]"+r"(y_buf), // %[y_buf] - [u_buf]"+r"(u_buf), // %[u_buf] - [v_buf]"+r"(v_buf), // %[v_buf] - [a_buf]"+r"(a_buf), // %[a_buf] - [dst_argb]"+r"(dst_argb), // %[dst_argb] -#if defined(__i386__) - [width]"+m"(width) // %[width] -#else - [width]"+rm"(width) // %[width] -#endif - : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] - : "memory", "cc", YUVTORGB_REGS_AVX2 - "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" - ); - // clang-format on -} -#endif // HAS_I422ALPHATOARGBROW_AVX2 - -#if defined(HAS_I422TORGBAROW_AVX2) -// 16 pixels -// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 RGBA (64 bytes). 
-void OMITFP I422ToRGBARow_AVX2(const uint8_t* y_buf, - const uint8_t* u_buf, - const uint8_t* v_buf, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { - asm volatile ( - YUVTORGB_SETUP_AVX2(yuvconstants) - "sub %[u_buf],%[v_buf] \n" - "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" - - LABELALIGN - "1: \n" - READYUV422_AVX2 - YUVTORGB_AVX2(yuvconstants) - - // Step 3: Weave into RGBA - "vpunpcklbw %%ymm2,%%ymm1,%%ymm1 \n" - "vpermq $0xd8,%%ymm1,%%ymm1 \n" - "vpunpcklbw %%ymm0,%%ymm5,%%ymm2 \n" - "vpermq $0xd8,%%ymm2,%%ymm2 \n" - "vpunpcklwd %%ymm1,%%ymm2,%%ymm0 \n" - "vpunpckhwd %%ymm1,%%ymm2,%%ymm1 \n" - "vmovdqu %%ymm0,(%[dst_argb]) \n" - "vmovdqu %%ymm1,0x20(%[dst_argb]) \n" - "lea 0x40(%[dst_argb]),%[dst_argb] \n" - "sub $0x10,%[width] \n" - "jg 1b \n" - "vzeroupper \n" - : [y_buf]"+r"(y_buf), // %[y_buf] - [u_buf]"+r"(u_buf), // %[u_buf] - [v_buf]"+r"(v_buf), // %[v_buf] - [dst_argb]"+r"(dst_argb), // %[dst_argb] - [width]"+rm"(width) // %[width] - : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] - : "memory", "cc", YUVTORGB_REGS_AVX2 - "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" - ); -} -#endif // HAS_I422TORGBAROW_AVX2 - -#if defined(HAS_NV12TOARGBROW_AVX2) -// 16 pixels. -// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes). -void OMITFP NV12ToARGBRow_AVX2(const uint8_t* y_buf, - const uint8_t* uv_buf, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { - // clang-format off - asm volatile ( - YUVTORGB_SETUP_AVX2(yuvconstants) - "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" - - LABELALIGN - "1: \n" - READNV12_AVX2 - YUVTORGB_AVX2(yuvconstants) - STOREARGB_AVX2 - "sub $0x10,%[width] \n" - "jg 1b \n" - "vzeroupper \n" - : [y_buf]"+r"(y_buf), // %[y_buf] - [uv_buf]"+r"(uv_buf), // %[uv_buf] - [dst_argb]"+r"(dst_argb), // %[dst_argb] - [width]"+rm"(width) // %[width] - : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] - : "memory", "cc", YUVTORGB_REGS_AVX2 - "xmm0", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" - ); - // clang-format on -} -#endif // HAS_NV12TOARGBROW_AVX2 - -#if defined(HAS_NV21TOARGBROW_AVX2) -// 16 pixels. -// 8 VU values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes). -void OMITFP NV21ToARGBRow_AVX2(const uint8_t* y_buf, - const uint8_t* vu_buf, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { - // clang-format off - asm volatile ( - YUVTORGB_SETUP_AVX2(yuvconstants) - "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" - - LABELALIGN - "1: \n" - READNV21_AVX2 - YUVTORGB_AVX2(yuvconstants) - STOREARGB_AVX2 - "sub $0x10,%[width] \n" - "jg 1b \n" - "vzeroupper \n" - : [y_buf]"+r"(y_buf), // %[y_buf] - [vu_buf]"+r"(vu_buf), // %[vu_buf] - [dst_argb]"+r"(dst_argb), // %[dst_argb] - [width]"+rm"(width) // %[width] - : [yuvconstants]"r"(yuvconstants), // %[yuvconstants] - [kShuffleNV21]"m"(kShuffleNV21) - : "memory", "cc", YUVTORGB_REGS_AVX2 - "xmm0", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" - ); - // clang-format on -} -#endif // HAS_NV21TOARGBROW_AVX2 - -#if defined(HAS_YUY2TOARGBROW_AVX2) -// 16 pixels. -// 8 YUY2 values with 16 Y and 8 UV producing 16 ARGB (64 bytes). 
-void OMITFP YUY2ToARGBRow_AVX2(const uint8_t* yuy2_buf, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { - // clang-format off - asm volatile ( - YUVTORGB_SETUP_AVX2(yuvconstants) - "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" - - LABELALIGN - "1: \n" - READYUY2_AVX2 - YUVTORGB_AVX2(yuvconstants) - STOREARGB_AVX2 - "sub $0x10,%[width] \n" - "jg 1b \n" - "vzeroupper \n" - : [yuy2_buf]"+r"(yuy2_buf), // %[yuy2_buf] - [dst_argb]"+r"(dst_argb), // %[dst_argb] - [width]"+rm"(width) // %[width] - : [yuvconstants]"r"(yuvconstants), // %[yuvconstants] - [kShuffleYUY2Y]"m"(kShuffleYUY2Y), - [kShuffleYUY2UV]"m"(kShuffleYUY2UV) - : "memory", "cc", YUVTORGB_REGS_AVX2 - "xmm0", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" - ); - // clang-format on -} -#endif // HAS_YUY2TOARGBROW_AVX2 - -#if defined(HAS_UYVYTOARGBROW_AVX2) -// 16 pixels. -// 8 UYVY values with 16 Y and 8 UV producing 16 ARGB (64 bytes). -void OMITFP UYVYToARGBRow_AVX2(const uint8_t* uyvy_buf, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { - // clang-format off - asm volatile ( - YUVTORGB_SETUP_AVX2(yuvconstants) - "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" - - LABELALIGN - "1: \n" - READUYVY_AVX2 - YUVTORGB_AVX2(yuvconstants) - STOREARGB_AVX2 - "sub $0x10,%[width] \n" - "jg 1b \n" - "vzeroupper \n" - : [uyvy_buf]"+r"(uyvy_buf), // %[uyvy_buf] - [dst_argb]"+r"(dst_argb), // %[dst_argb] - [width]"+rm"(width) // %[width] - : [yuvconstants]"r"(yuvconstants), // %[yuvconstants] - [kShuffleUYVYY]"m"(kShuffleUYVYY), - [kShuffleUYVYUV]"m"(kShuffleUYVYUV) - : "memory", "cc", YUVTORGB_REGS_AVX2 - "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" - ); - // clang-format on -} -#endif // HAS_UYVYTOARGBROW_AVX2 - -#if defined(HAS_P210TOARGBROW_AVX2) -// 16 pixels. -// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes). -void OMITFP P210ToARGBRow_AVX2(const uint16_t* y_buf, - const uint16_t* uv_buf, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { - // clang-format off - asm volatile ( - YUVTORGB_SETUP_AVX2(yuvconstants) - "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" - - LABELALIGN - "1: \n" - READP210_AVX2 - YUVTORGB_AVX2(yuvconstants) - STOREARGB_AVX2 - "sub $0x10,%[width] \n" - "jg 1b \n" - "vzeroupper \n" - : [y_buf]"+r"(y_buf), // %[y_buf] - [uv_buf]"+r"(uv_buf), // %[uv_buf] - [dst_argb]"+r"(dst_argb), // %[dst_argb] - [width]"+rm"(width) // %[width] - : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] - : "memory", "cc", YUVTORGB_REGS_AVX2 - "xmm0", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" - ); - // clang-format on -} -#endif // HAS_P210TOARGBROW_AVX2 - -#if defined(HAS_P410TOARGBROW_AVX2) -// 16 pixels. -// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes). 
-void OMITFP P410ToARGBRow_AVX2(const uint16_t* y_buf, - const uint16_t* uv_buf, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { - // clang-format off - asm volatile ( - YUVTORGB_SETUP_AVX2(yuvconstants) - "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" - - LABELALIGN - "1: \n" - READP410_AVX2 - YUVTORGB_AVX2(yuvconstants) - STOREARGB_AVX2 - "sub $0x10,%[width] \n" - "jg 1b \n" - "vzeroupper \n" - : [y_buf]"+r"(y_buf), // %[y_buf] - [uv_buf]"+r"(uv_buf), // %[uv_buf] - [dst_argb]"+r"(dst_argb), // %[dst_argb] - [width]"+rm"(width) // %[width] - : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] - : "memory", "cc", YUVTORGB_REGS_AVX2 - "xmm0", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" - ); - // clang-format on -} -#endif // HAS_P410TOARGBROW_AVX2 - -#if defined(HAS_P210TOAR30ROW_AVX2) -// 16 pixels -// 16 UV values with 16 Y producing 16 AR30 (64 bytes). -void OMITFP P210ToAR30Row_AVX2(const uint16_t* y_buf, - const uint16_t* uv_buf, - uint8_t* dst_ar30, - const struct YuvConstants* yuvconstants, - int width) { - asm volatile ( - YUVTORGB_SETUP_AVX2(yuvconstants) - "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" // AR30 constants - "vpsrlw $14,%%ymm5,%%ymm5 \n" - "vpsllw $4,%%ymm5,%%ymm5 \n" // 2 alpha bits - "vpxor %%ymm6,%%ymm6,%%ymm6 \n" // 0 for min - "vpcmpeqb %%ymm7,%%ymm7,%%ymm7 \n" // 1023 for max - "vpsrlw $6,%%ymm7,%%ymm7 \n" - - LABELALIGN - "1: \n" - READP210_AVX2 - YUVTORGB16_AVX2(yuvconstants) - STOREAR30_AVX2 - "sub $0x10,%[width] \n" - "jg 1b \n" - - "vzeroupper \n" - : [y_buf]"+r"(y_buf), // %[y_buf] - [uv_buf]"+r"(uv_buf), // %[uv_buf] - [dst_ar30]"+r"(dst_ar30), // %[dst_ar30] - [width]"+rm"(width) // %[width] - : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] - : "memory", "cc", YUVTORGB_REGS_AVX2 - "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" - ); -} -#endif // HAS_P210TOAR30ROW_AVX2 - -#if defined(HAS_P410TOAR30ROW_AVX2) -// 16 pixels -// 16 UV values with 16 Y producing 16 AR30 (64 bytes). -void OMITFP P410ToAR30Row_AVX2(const uint16_t* y_buf, - const uint16_t* uv_buf, - uint8_t* dst_ar30, - const struct YuvConstants* yuvconstants, - int width) { - asm volatile ( - YUVTORGB_SETUP_AVX2(yuvconstants) - "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" // AR30 constants - "vpsrlw $14,%%ymm5,%%ymm5 \n" - "vpsllw $4,%%ymm5,%%ymm5 \n" // 2 alpha bits - "vpxor %%ymm6,%%ymm6,%%ymm6 \n" // 0 for min - "vpcmpeqb %%ymm7,%%ymm7,%%ymm7 \n" // 1023 for max - "vpsrlw $6,%%ymm7,%%ymm7 \n" - - LABELALIGN - "1: \n" - READP410_AVX2 - YUVTORGB16_AVX2(yuvconstants) - STOREAR30_AVX2 - "sub $0x10,%[width] \n" - "jg 1b \n" - - "vzeroupper \n" - : [y_buf]"+r"(y_buf), // %[y_buf] - [uv_buf]"+r"(uv_buf), // %[uv_buf] - [dst_ar30]"+r"(dst_ar30), // %[dst_ar30] - [width]"+rm"(width) // %[width] - : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] - : "memory", "cc", YUVTORGB_REGS_AVX2 - "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" - ); -} -#endif // HAS_P410TOAR30ROW_AVX2 - -#ifdef HAS_I400TOARGBROW_SSE2 -void I400ToARGBRow_SSE2(const uint8_t* y_buf, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { - asm volatile( - "movdqa 96(%3),%%xmm2 \n" // yg = 18997 = 1.164 - "movdqa 128(%3),%%xmm3 \n" // ygb = 1160 = 1.164 * 16 - "pcmpeqb %%xmm4,%%xmm4 \n" // 0xff000000 - "pslld $0x18,%%xmm4 \n" - - LABELALIGN - "1: \n" - // Step 1: Scale Y contribution to 8 G values. 
G = (y - 16) * 1.164 - "movq (%0),%%xmm0 \n" - "lea 0x8(%0),%0 \n" - "punpcklbw %%xmm0,%%xmm0 \n" - "pmulhuw %%xmm2,%%xmm0 \n" - "paddsw %%xmm3,%%xmm0 \n" - "psraw $6, %%xmm0 \n" - "packuswb %%xmm0,%%xmm0 \n" - - // Step 2: Weave into ARGB - "punpcklbw %%xmm0,%%xmm0 \n" - "movdqa %%xmm0,%%xmm1 \n" - "punpcklwd %%xmm0,%%xmm0 \n" - "punpckhwd %%xmm1,%%xmm1 \n" - "por %%xmm4,%%xmm0 \n" - "por %%xmm4,%%xmm1 \n" - "movdqu %%xmm0,(%1) \n" - "movdqu %%xmm1,0x10(%1) \n" - "lea 0x20(%1),%1 \n" - - "sub $0x8,%2 \n" - "jg 1b \n" - : "+r"(y_buf), // %0 - "+r"(dst_argb), // %1 - "+rm"(width) // %2 - : "r"(yuvconstants) // %3 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"); -} -#endif // HAS_I400TOARGBROW_SSE2 - -#ifdef HAS_I400TOARGBROW_AVX2 -// 16 pixels of Y converted to 16 pixels of ARGB (64 bytes). -// note: vpunpcklbw mutates and vpackuswb unmutates. -void I400ToARGBRow_AVX2(const uint8_t* y_buf, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { - asm volatile( - "vmovdqa 96(%3),%%ymm2 \n" // yg = 18997 = 1.164 - "vmovdqa 128(%3),%%ymm3 \n" // ygb = -1160 = 1.164*16 - "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n" // 0xff000000 - "vpslld $0x18,%%ymm4,%%ymm4 \n" - - LABELALIGN - "1: \n" - // Step 1: Scale Y contribution to 16 G values. G = (y - 16) * 1.164 - "vmovdqu (%0),%%xmm0 \n" - "lea 0x10(%0),%0 \n" - "vpermq $0xd8,%%ymm0,%%ymm0 \n" - "vpunpcklbw %%ymm0,%%ymm0,%%ymm0 \n" - "vpmulhuw %%ymm2,%%ymm0,%%ymm0 \n" - "vpaddsw %%ymm3,%%ymm0,%%ymm0 \n" - "vpsraw $0x6,%%ymm0,%%ymm0 \n" - "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n" - "vpunpcklbw %%ymm0,%%ymm0,%%ymm1 \n" - "vpermq $0xd8,%%ymm1,%%ymm1 \n" - "vpunpcklwd %%ymm1,%%ymm1,%%ymm0 \n" - "vpunpckhwd %%ymm1,%%ymm1,%%ymm1 \n" - "vpor %%ymm4,%%ymm0,%%ymm0 \n" - "vpor %%ymm4,%%ymm1,%%ymm1 \n" - "vmovdqu %%ymm0,(%1) \n" - "vmovdqu %%ymm1,0x20(%1) \n" - "lea 0x40(%1),%1 \n" - "sub $0x10,%2 \n" - "jg 1b \n" - "vzeroupper \n" - : "+r"(y_buf), // %0 - "+r"(dst_argb), // %1 - "+rm"(width) // %2 - : "r"(yuvconstants) // %3 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"); -} -#endif // HAS_I400TOARGBROW_AVX2 - -#ifdef HAS_MIRRORROW_SSSE3 -// Shuffle table for reversing the bytes. -static const uvec8 kShuffleMirror = {15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u, - 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u}; - -void MirrorRow_SSSE3(const uint8_t* src, uint8_t* dst, int width) { - intptr_t temp_width = (intptr_t)(width); - asm volatile( - - "movdqa %3,%%xmm5 \n" - - LABELALIGN - "1: \n" - "movdqu -0x10(%0,%2,1),%%xmm0 \n" - "pshufb %%xmm5,%%xmm0 \n" - "movdqu %%xmm0,(%1) \n" - "lea 0x10(%1),%1 \n" - "sub $0x10,%2 \n" - "jg 1b \n" - : "+r"(src), // %0 - "+r"(dst), // %1 - "+r"(temp_width) // %2 - : "m"(kShuffleMirror) // %3 - : "memory", "cc", "xmm0", "xmm5"); -} -#endif // HAS_MIRRORROW_SSSE3 - -#ifdef HAS_MIRRORROW_AVX2 -void MirrorRow_AVX2(const uint8_t* src, uint8_t* dst, int width) { - intptr_t temp_width = (intptr_t)(width); - asm volatile( - - "vbroadcastf128 %3,%%ymm5 \n" - - LABELALIGN - "1: \n" - "vmovdqu -0x20(%0,%2,1),%%ymm0 \n" - "vpshufb %%ymm5,%%ymm0,%%ymm0 \n" - "vpermq $0x4e,%%ymm0,%%ymm0 \n" - "vmovdqu %%ymm0,(%1) \n" - "lea 0x20(%1),%1 \n" - "sub $0x20,%2 \n" - "jg 1b \n" - "vzeroupper \n" - : "+r"(src), // %0 - "+r"(dst), // %1 - "+r"(temp_width) // %2 - : "m"(kShuffleMirror) // %3 - : "memory", "cc", "xmm0", "xmm5"); -} -#endif // HAS_MIRRORROW_AVX2 - -#ifdef HAS_MIRRORUVROW_SSSE3 -// Shuffle table for reversing the UV. 
-static const uvec8 kShuffleMirrorUV = {14u, 15u, 12u, 13u, 10u, 11u, 8u, 9u, - 6u, 7u, 4u, 5u, 2u, 3u, 0u, 1u}; - -void MirrorUVRow_SSSE3(const uint8_t* src_uv, uint8_t* dst_uv, int width) { - intptr_t temp_width = (intptr_t)(width); - asm volatile( - - "movdqa %3,%%xmm5 \n" - - LABELALIGN - "1: \n" - "movdqu -0x10(%0,%2,2),%%xmm0 \n" - "pshufb %%xmm5,%%xmm0 \n" - "movdqu %%xmm0,(%1) \n" - "lea 0x10(%1),%1 \n" - "sub $0x8,%2 \n" - "jg 1b \n" - : "+r"(src_uv), // %0 - "+r"(dst_uv), // %1 - "+r"(temp_width) // %2 - : "m"(kShuffleMirrorUV) // %3 - : "memory", "cc", "xmm0", "xmm5"); -} -#endif // HAS_MIRRORUVROW_SSSE3 - -#ifdef HAS_MIRRORUVROW_AVX2 -void MirrorUVRow_AVX2(const uint8_t* src_uv, uint8_t* dst_uv, int width) { - intptr_t temp_width = (intptr_t)(width); - asm volatile( - - "vbroadcastf128 %3,%%ymm5 \n" - - LABELALIGN - "1: \n" - "vmovdqu -0x20(%0,%2,2),%%ymm0 \n" - "vpshufb %%ymm5,%%ymm0,%%ymm0 \n" - "vpermq $0x4e,%%ymm0,%%ymm0 \n" - "vmovdqu %%ymm0,(%1) \n" - "lea 0x20(%1),%1 \n" - "sub $0x10,%2 \n" - "jg 1b \n" - "vzeroupper \n" - : "+r"(src_uv), // %0 - "+r"(dst_uv), // %1 - "+r"(temp_width) // %2 - : "m"(kShuffleMirrorUV) // %3 - : "memory", "cc", "xmm0", "xmm5"); -} -#endif // HAS_MIRRORUVROW_AVX2 - -#ifdef HAS_MIRRORSPLITUVROW_SSSE3 -// Shuffle table for reversing the bytes of UV channels. -static const uvec8 kShuffleMirrorSplitUV = {14u, 12u, 10u, 8u, 6u, 4u, 2u, 0u, - 15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u}; -void MirrorSplitUVRow_SSSE3(const uint8_t* src, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - intptr_t temp_width = (intptr_t)(width); - asm volatile( - "movdqa %4,%%xmm1 \n" - "lea -0x10(%0,%3,2),%0 \n" - "sub %1,%2 \n" - - LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "lea -0x10(%0),%0 \n" - "pshufb %%xmm1,%%xmm0 \n" - "movlpd %%xmm0,(%1) \n" - "movhpd %%xmm0,0x00(%1,%2,1) \n" - "lea 0x8(%1),%1 \n" - "sub $8,%3 \n" - "jg 1b \n" - : "+r"(src), // %0 - "+r"(dst_u), // %1 - "+r"(dst_v), // %2 - "+r"(temp_width) // %3 - : "m"(kShuffleMirrorSplitUV) // %4 - : "memory", "cc", "xmm0", "xmm1"); -} -#endif // HAS_MIRRORSPLITUVROW_SSSE3 - -#ifdef HAS_RGB24MIRRORROW_SSSE3 - -// Shuffle first 5 pixels to last 5 mirrored. first byte zero -static const uvec8 kShuffleMirrorRGB0 = {128u, 12u, 13u, 14u, 9u, 10u, 11u, 6u, - 7u, 8u, 3u, 4u, 5u, 0u, 1u, 2u}; - -// Shuffle last 5 pixels to first 5 mirrored. 
last byte zero -static const uvec8 kShuffleMirrorRGB1 = { - 13u, 14u, 15u, 10u, 11u, 12u, 7u, 8u, 9u, 4u, 5u, 6u, 1u, 2u, 3u, 128u}; - -// Shuffle 5 pixels at a time (15 bytes) -void RGB24MirrorRow_SSSE3(const uint8_t* src_rgb24, - uint8_t* dst_rgb24, - int width) { - intptr_t temp_width = (intptr_t)(width); - src_rgb24 += width * 3 - 48; - asm volatile( - "movdqa %3,%%xmm4 \n" - "movdqa %4,%%xmm5 \n" - - LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" // first 5 - "movdqu 15(%0),%%xmm1 \n" // next 5 - "movdqu 30(%0),%%xmm2 \n" // next 5 - "movdqu 32(%0),%%xmm3 \n" // last 1 special - "pshufb %%xmm4,%%xmm0 \n" - "pshufb %%xmm4,%%xmm1 \n" - "pshufb %%xmm4,%%xmm2 \n" - "pshufb %%xmm5,%%xmm3 \n" - "lea -0x30(%0),%0 \n" - "movdqu %%xmm0,32(%1) \n" // last 5 - "movdqu %%xmm1,17(%1) \n" // next 5 - "movdqu %%xmm2,2(%1) \n" // next 5 - "movlpd %%xmm3,0(%1) \n" // first 1 - "lea 0x30(%1),%1 \n" - "sub $0x10,%2 \n" - "jg 1b \n" - : "+r"(src_rgb24), // %0 - "+r"(dst_rgb24), // %1 - "+r"(temp_width) // %2 - : "m"(kShuffleMirrorRGB0), // %3 - "m"(kShuffleMirrorRGB1) // %4 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); -} -#endif // HAS_RGB24MIRRORROW_SSSE3 - -#ifdef HAS_ARGBMIRRORROW_SSE2 - -void ARGBMirrorRow_SSE2(const uint8_t* src, uint8_t* dst, int width) { - intptr_t temp_width = (intptr_t)(width); - asm volatile( - - "lea -0x10(%0,%2,4),%0 \n" - - LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "pshufd $0x1b,%%xmm0,%%xmm0 \n" - "lea -0x10(%0),%0 \n" - "movdqu %%xmm0,(%1) \n" - "lea 0x10(%1),%1 \n" - "sub $0x4,%2 \n" - "jg 1b \n" - : "+r"(src), // %0 - "+r"(dst), // %1 - "+r"(temp_width) // %2 - : - : "memory", "cc", "xmm0"); -} -#endif // HAS_ARGBMIRRORROW_SSE2 - -#ifdef HAS_ARGBMIRRORROW_AVX2 -// Shuffle table for reversing the bytes. 
-static const ulvec32 kARGBShuffleMirror_AVX2 = {7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u}; -void ARGBMirrorRow_AVX2(const uint8_t* src, uint8_t* dst, int width) { - intptr_t temp_width = (intptr_t)(width); - asm volatile( - - "vmovdqu %3,%%ymm5 \n" - - LABELALIGN - "1: \n" - "vpermd -0x20(%0,%2,4),%%ymm5,%%ymm0 \n" - "vmovdqu %%ymm0,(%1) \n" - "lea 0x20(%1),%1 \n" - "sub $0x8,%2 \n" - "jg 1b \n" - "vzeroupper \n" - : "+r"(src), // %0 - "+r"(dst), // %1 - "+r"(temp_width) // %2 - : "m"(kARGBShuffleMirror_AVX2) // %3 - : "memory", "cc", "xmm0", "xmm5"); -} -#endif // HAS_ARGBMIRRORROW_AVX2 - -#ifdef HAS_SPLITUVROW_AVX2 -void SplitUVRow_AVX2(const uint8_t* src_uv, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - asm volatile( - "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" - "vpsrlw $0x8,%%ymm5,%%ymm5 \n" - "sub %1,%2 \n" - - LABELALIGN - "1: \n" - "vmovdqu (%0),%%ymm0 \n" - "vmovdqu 0x20(%0),%%ymm1 \n" - "lea 0x40(%0),%0 \n" - "vpsrlw $0x8,%%ymm0,%%ymm2 \n" - "vpsrlw $0x8,%%ymm1,%%ymm3 \n" - "vpand %%ymm5,%%ymm0,%%ymm0 \n" - "vpand %%ymm5,%%ymm1,%%ymm1 \n" - "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" - "vpackuswb %%ymm3,%%ymm2,%%ymm2 \n" - "vpermq $0xd8,%%ymm0,%%ymm0 \n" - "vpermq $0xd8,%%ymm2,%%ymm2 \n" - "vmovdqu %%ymm0,(%1) \n" - "vmovdqu %%ymm2,0x00(%1,%2,1) \n" - "lea 0x20(%1),%1 \n" - "sub $0x20,%3 \n" - "jg 1b \n" - "vzeroupper \n" - : "+r"(src_uv), // %0 - "+r"(dst_u), // %1 - "+r"(dst_v), // %2 - "+r"(width) // %3 - : - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"); -} -#endif // HAS_SPLITUVROW_AVX2 - -#ifdef HAS_SPLITUVROW_SSE2 -void SplitUVRow_SSE2(const uint8_t* src_uv, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - asm volatile( - "pcmpeqb %%xmm5,%%xmm5 \n" - "psrlw $0x8,%%xmm5 \n" - "sub %1,%2 \n" - - LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "lea 0x20(%0),%0 \n" - "movdqa %%xmm0,%%xmm2 \n" - "movdqa %%xmm1,%%xmm3 \n" - "pand %%xmm5,%%xmm0 \n" - "pand %%xmm5,%%xmm1 \n" - "packuswb %%xmm1,%%xmm0 \n" - "psrlw $0x8,%%xmm2 \n" - "psrlw $0x8,%%xmm3 \n" - "packuswb %%xmm3,%%xmm2 \n" - "movdqu %%xmm0,(%1) \n" - "movdqu %%xmm2,0x00(%1,%2,1) \n" - "lea 0x10(%1),%1 \n" - "sub $0x10,%3 \n" - "jg 1b \n" - : "+r"(src_uv), // %0 - "+r"(dst_u), // %1 - "+r"(dst_v), // %2 - "+r"(width) // %3 - : - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"); -} -#endif // HAS_SPLITUVROW_SSE2 - -#ifdef HAS_MERGEUVROW_AVX2 -void MergeUVRow_AVX2(const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* dst_uv, - int width) { - asm volatile( - - "sub %0,%1 \n" - - LABELALIGN - "1: \n" - "vmovdqu (%0),%%ymm0 \n" - "vmovdqu 0x00(%0,%1,1),%%ymm1 \n" - "lea 0x20(%0),%0 \n" - "vpunpcklbw %%ymm1,%%ymm0,%%ymm2 \n" - "vpunpckhbw %%ymm1,%%ymm0,%%ymm0 \n" - "vextractf128 $0x0,%%ymm2,(%2) \n" - "vextractf128 $0x0,%%ymm0,0x10(%2) \n" - "vextractf128 $0x1,%%ymm2,0x20(%2) \n" - "vextractf128 $0x1,%%ymm0,0x30(%2) \n" - "lea 0x40(%2),%2 \n" - "sub $0x20,%3 \n" - "jg 1b \n" - "vzeroupper \n" - : "+r"(src_u), // %0 - "+r"(src_v), // %1 - "+r"(dst_uv), // %2 - "+r"(width) // %3 - : - : "memory", "cc", "xmm0", "xmm1", "xmm2"); -} -#endif // HAS_MERGEUVROW_AVX2 - -#ifdef HAS_MERGEUVROW_SSE2 -void MergeUVRow_SSE2(const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* dst_uv, - int width) { - asm volatile( - - "sub %0,%1 \n" - - LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x00(%0,%1,1),%%xmm1 \n" - "lea 0x10(%0),%0 \n" - "movdqa %%xmm0,%%xmm2 \n" - "punpcklbw %%xmm1,%%xmm0 \n" - "punpckhbw %%xmm1,%%xmm2 \n" - "movdqu %%xmm0,(%2) \n" - "movdqu %%xmm2,0x10(%2) \n" - "lea 
0x20(%2),%2 \n" - "sub $0x10,%3 \n" - "jg 1b \n" - : "+r"(src_u), // %0 - "+r"(src_v), // %1 - "+r"(dst_uv), // %2 - "+r"(width) // %3 - : - : "memory", "cc", "xmm0", "xmm1", "xmm2"); -} -#endif // HAS_MERGEUVROW_SSE2 - -#ifdef HAS_MERGEUVROW_16_AVX2 -void MergeUVRow_16_AVX2(const uint16_t* src_u, - const uint16_t* src_v, - uint16_t* dst_uv, - int depth, - int width) { - depth = 16 - depth; - // clang-format off - asm volatile ( - "vmovd %4,%%xmm3 \n" - "sub %0,%1 \n" - - // 16 pixels per loop. - LABELALIGN - "1: \n" - "vmovdqu (%0),%%ymm0 \n" - "vmovdqu (%0,%1,1),%%ymm1 \n" - "add $0x20,%0 \n" - - "vpsllw %%xmm3,%%ymm0,%%ymm0 \n" - "vpsllw %%xmm3,%%ymm1,%%ymm1 \n" - "vpunpcklwd %%ymm1,%%ymm0,%%ymm2 \n" // mutates - "vpunpckhwd %%ymm1,%%ymm0,%%ymm0 \n" - "vextractf128 $0x0,%%ymm2,(%2) \n" - "vextractf128 $0x0,%%ymm0,0x10(%2) \n" - "vextractf128 $0x1,%%ymm2,0x20(%2) \n" - "vextractf128 $0x1,%%ymm0,0x30(%2) \n" - "add $0x40,%2 \n" - "sub $0x10,%3 \n" - "jg 1b \n" - "vzeroupper \n" - : "+r"(src_u), // %0 - "+r"(src_v), // %1 - "+r"(dst_uv), // %2 - "+r"(width) // %3 - : "r"(depth) // %4 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3"); - // clang-format on -} -#endif // HAS_MERGEUVROW_AVX2 - -#ifdef HAS_SPLITUVROW_16_AVX2 -const uvec8 kSplitUVShuffle16 = {0, 1, 4, 5, 8, 9, 12, 13, - 2, 3, 6, 7, 10, 11, 14, 15}; -void SplitUVRow_16_AVX2(const uint16_t* src_uv, - uint16_t* dst_u, - uint16_t* dst_v, - int depth, - int width) { - depth = 16 - depth; - // clang-format off - asm volatile ( - "vmovd %4,%%xmm3 \n" - "vbroadcastf128 %5,%%ymm4 \n" - "sub %1,%2 \n" - - // 16 pixels per loop. - LABELALIGN - "1: \n" - "vmovdqu (%0),%%ymm0 \n" - "vmovdqu 0x20(%0),%%ymm1 \n" - "add $0x40,%0 \n" - - "vpsrlw %%xmm3,%%ymm0,%%ymm0 \n" - "vpsrlw %%xmm3,%%ymm1,%%ymm1 \n" - "vpshufb %%ymm4,%%ymm0,%%ymm0 \n" - "vpshufb %%ymm4,%%ymm1,%%ymm1 \n" - "vpermq $0xd8,%%ymm0,%%ymm0 \n" - "vpermq $0xd8,%%ymm1,%%ymm1 \n" - "vextractf128 $0x0,%%ymm0,(%1) \n" - "vextractf128 $0x0,%%ymm1,0x10(%1) \n" - "vextractf128 $0x1,%%ymm0,(%1,%2) \n" - "vextractf128 $0x1,%%ymm1,0x10(%1,%2) \n" - "add $0x20,%1 \n" - "sub $0x10,%3 \n" - "jg 1b \n" - "vzeroupper \n" - : "+r"(src_uv), // %0 - "+r"(dst_u), // %1 - "+r"(dst_v), // %2 - "+r"(width) // %3 - : "r"(depth), // %4 - "m"(kSplitUVShuffle16) // %5 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"); - // clang-format on -} -#endif // HAS_SPLITUVROW_16_AVX2 - -// Use scale to convert lsb formats to msb, depending how many bits there are: -// 128 = 9 bits -// 64 = 10 bits -// 16 = 12 bits -// 1 = 16 bits -#ifdef HAS_MULTIPLYROW_16_AVX2 -void MultiplyRow_16_AVX2(const uint16_t* src_y, - uint16_t* dst_y, - int scale, - int width) { - // clang-format off - asm volatile ( - "vmovd %3,%%xmm3 \n" - "vpunpcklwd %%xmm3,%%xmm3,%%xmm3 \n" - "vbroadcastss %%xmm3,%%ymm3 \n" - "sub %0,%1 \n" - - // 32 pixels per loop. 
- LABELALIGN - "1: \n" - "vmovdqu (%0),%%ymm0 \n" - "vmovdqu 0x20(%0),%%ymm1 \n" - "vpmullw %%ymm3,%%ymm0,%%ymm0 \n" - "vpmullw %%ymm3,%%ymm1,%%ymm1 \n" - "vmovdqu %%ymm0,(%0,%1) \n" - "vmovdqu %%ymm1,0x20(%0,%1) \n" - "add $0x40,%0 \n" - "sub $0x20,%2 \n" - "jg 1b \n" - "vzeroupper \n" - : "+r"(src_y), // %0 - "+r"(dst_y), // %1 - "+r"(width) // %2 - : "r"(scale) // %3 - : "memory", "cc", "xmm0", "xmm1", "xmm3"); - // clang-format on -} -#endif // HAS_MULTIPLYROW_16_AVX2 - -// Use scale to convert msb formats to lsb, depending how many bits there are: -// 512 = 9 bits -// 1024 = 10 bits -// 4096 = 12 bits -// 65536 = 16 bits -#ifdef HAS_DIVIDEROW_16_AVX2 -void DivideRow_16_AVX2(const uint16_t* src_y, - uint16_t* dst_y, - int scale, - int width) { - // clang-format off - asm volatile ( - "vmovd %3,%%xmm3 \n" - "vpunpcklwd %%xmm3,%%xmm3,%%xmm3 \n" - "vbroadcastss %%xmm3,%%ymm3 \n" - "sub %0,%1 \n" - - // 32 pixels per loop. - LABELALIGN - "1: \n" - "vmovdqu (%0),%%ymm0 \n" - "vmovdqu 0x20(%0),%%ymm1 \n" - "vpmulhuw %%ymm3,%%ymm0,%%ymm0 \n" - "vpmulhuw %%ymm3,%%ymm1,%%ymm1 \n" - "vmovdqu %%ymm0,(%0,%1) \n" - "vmovdqu %%ymm1,0x20(%0,%1) \n" - "add $0x40,%0 \n" - "sub $0x20,%2 \n" - "jg 1b \n" - "vzeroupper \n" - : "+r"(src_y), // %0 - "+r"(dst_y), // %1 - "+r"(width), // %2 - "+r"(scale) // %3 - : - : "memory", "cc", "xmm0", "xmm1", "xmm3"); - // clang-format on -} -#endif // HAS_MULTIPLYROW_16_AVX2 - -// Use scale to convert lsb formats to msb, depending how many bits there are: -// 32768 = 9 bits -// 16384 = 10 bits -// 4096 = 12 bits -// 256 = 16 bits -void Convert16To8Row_SSSE3(const uint16_t* src_y, - uint8_t* dst_y, - int scale, - int width) { - // clang-format off - asm volatile ( - "movd %3,%%xmm2 \n" - "punpcklwd %%xmm2,%%xmm2 \n" - "pshufd $0x0,%%xmm2,%%xmm2 \n" - - // 32 pixels per loop. - LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "add $0x20,%0 \n" - "pmulhuw %%xmm2,%%xmm0 \n" - "pmulhuw %%xmm2,%%xmm1 \n" - "packuswb %%xmm1,%%xmm0 \n" - "movdqu %%xmm0,(%1) \n" - "add $0x10,%1 \n" - "sub $0x10,%2 \n" - "jg 1b \n" - : "+r"(src_y), // %0 - "+r"(dst_y), // %1 - "+r"(width) // %2 - : "r"(scale) // %3 - : "memory", "cc", "xmm0", "xmm1", "xmm2"); - // clang-format on -} - -#ifdef HAS_CONVERT16TO8ROW_AVX2 -void Convert16To8Row_AVX2(const uint16_t* src_y, - uint8_t* dst_y, - int scale, - int width) { - // clang-format off - asm volatile ( - "vmovd %3,%%xmm2 \n" - "vpunpcklwd %%xmm2,%%xmm2,%%xmm2 \n" - "vbroadcastss %%xmm2,%%ymm2 \n" - - // 32 pixels per loop. - LABELALIGN - "1: \n" - "vmovdqu (%0),%%ymm0 \n" - "vmovdqu 0x20(%0),%%ymm1 \n" - "add $0x40,%0 \n" - "vpmulhuw %%ymm2,%%ymm0,%%ymm0 \n" - "vpmulhuw %%ymm2,%%ymm1,%%ymm1 \n" - "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" // mutates - "vpermq $0xd8,%%ymm0,%%ymm0 \n" - "vmovdqu %%ymm0,(%1) \n" - "add $0x20,%1 \n" - "sub $0x20,%2 \n" - "jg 1b \n" - "vzeroupper \n" - : "+r"(src_y), // %0 - "+r"(dst_y), // %1 - "+r"(width) // %2 - : "r"(scale) // %3 - : "memory", "cc", "xmm0", "xmm1", "xmm2"); - // clang-format on -} -#endif // HAS_CONVERT16TO8ROW_AVX2 - -// Use scale to convert to lsb formats depending how many bits there are: -// 512 = 9 bits -// 1024 = 10 bits -// 4096 = 12 bits -// TODO(fbarchard): reduce to SSE2 -void Convert8To16Row_SSE2(const uint8_t* src_y, - uint16_t* dst_y, - int scale, - int width) { - // clang-format off - asm volatile ( - "movd %3,%%xmm2 \n" - "punpcklwd %%xmm2,%%xmm2 \n" - "pshufd $0x0,%%xmm2,%%xmm2 \n" - - // 32 pixels per loop. 
- LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqa %%xmm0,%%xmm1 \n" - "punpcklbw %%xmm0,%%xmm0 \n" - "punpckhbw %%xmm1,%%xmm1 \n" - "add $0x10,%0 \n" - "pmulhuw %%xmm2,%%xmm0 \n" - "pmulhuw %%xmm2,%%xmm1 \n" - "movdqu %%xmm0,(%1) \n" - "movdqu %%xmm1,0x10(%1) \n" - "add $0x20,%1 \n" - "sub $0x10,%2 \n" - "jg 1b \n" - : "+r"(src_y), // %0 - "+r"(dst_y), // %1 - "+r"(width) // %2 - : "r"(scale) // %3 - : "memory", "cc", "xmm0", "xmm1", "xmm2"); - // clang-format on -} - -#ifdef HAS_CONVERT8TO16ROW_AVX2 -void Convert8To16Row_AVX2(const uint8_t* src_y, - uint16_t* dst_y, - int scale, - int width) { - // clang-format off - asm volatile ( - "vmovd %3,%%xmm2 \n" - "vpunpcklwd %%xmm2,%%xmm2,%%xmm2 \n" - "vbroadcastss %%xmm2,%%ymm2 \n" - - // 32 pixels per loop. - LABELALIGN - "1: \n" - "vmovdqu (%0),%%ymm0 \n" - "vpermq $0xd8,%%ymm0,%%ymm0 \n" - "add $0x20,%0 \n" - "vpunpckhbw %%ymm0,%%ymm0,%%ymm1 \n" - "vpunpcklbw %%ymm0,%%ymm0,%%ymm0 \n" - "vpmulhuw %%ymm2,%%ymm0,%%ymm0 \n" - "vpmulhuw %%ymm2,%%ymm1,%%ymm1 \n" - "vmovdqu %%ymm0,(%1) \n" - "vmovdqu %%ymm1,0x20(%1) \n" - "add $0x40,%1 \n" - "sub $0x20,%2 \n" - "jg 1b \n" - "vzeroupper \n" - : "+r"(src_y), // %0 - "+r"(dst_y), // %1 - "+r"(width) // %2 - : "r"(scale) // %3 - : "memory", "cc", "xmm0", "xmm1", "xmm2"); - // clang-format on -} -#endif // HAS_CONVERT8TO16ROW_AVX2 - -#ifdef HAS_SPLITRGBROW_SSSE3 - -// Shuffle table for converting RGB to Planar. -static const uvec8 kShuffleMaskRGBToR0 = {0u, 3u, 6u, 9u, 12u, 15u, - 128u, 128u, 128u, 128u, 128u, 128u, - 128u, 128u, 128u, 128u}; -static const uvec8 kShuffleMaskRGBToR1 = {128u, 128u, 128u, 128u, 128u, 128u, - 2u, 5u, 8u, 11u, 14u, 128u, - 128u, 128u, 128u, 128u}; -static const uvec8 kShuffleMaskRGBToR2 = {128u, 128u, 128u, 128u, 128u, 128u, - 128u, 128u, 128u, 128u, 128u, 1u, - 4u, 7u, 10u, 13u}; - -static const uvec8 kShuffleMaskRGBToG0 = {1u, 4u, 7u, 10u, 13u, 128u, - 128u, 128u, 128u, 128u, 128u, 128u, - 128u, 128u, 128u, 128u}; -static const uvec8 kShuffleMaskRGBToG1 = {128u, 128u, 128u, 128u, 128u, 0u, - 3u, 6u, 9u, 12u, 15u, 128u, - 128u, 128u, 128u, 128u}; -static const uvec8 kShuffleMaskRGBToG2 = {128u, 128u, 128u, 128u, 128u, 128u, - 128u, 128u, 128u, 128u, 128u, 2u, - 5u, 8u, 11u, 14u}; - -static const uvec8 kShuffleMaskRGBToB0 = {2u, 5u, 8u, 11u, 14u, 128u, - 128u, 128u, 128u, 128u, 128u, 128u, - 128u, 128u, 128u, 128u}; -static const uvec8 kShuffleMaskRGBToB1 = {128u, 128u, 128u, 128u, 128u, 1u, - 4u, 7u, 10u, 13u, 128u, 128u, - 128u, 128u, 128u, 128u}; -static const uvec8 kShuffleMaskRGBToB2 = {128u, 128u, 128u, 128u, 128u, 128u, - 128u, 128u, 128u, 128u, 0u, 3u, - 6u, 9u, 12u, 15u}; - -void SplitRGBRow_SSSE3(const uint8_t* src_rgb, - uint8_t* dst_r, - uint8_t* dst_g, - uint8_t* dst_b, - int width) { - asm volatile( - - LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "movdqu 0x20(%0),%%xmm2 \n" - "pshufb %5, %%xmm0 \n" - "pshufb %6, %%xmm1 \n" - "pshufb %7, %%xmm2 \n" - "por %%xmm1,%%xmm0 \n" - "por %%xmm2,%%xmm0 \n" - "movdqu %%xmm0,(%1) \n" - "lea 0x10(%1),%1 \n" - - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "movdqu 0x20(%0),%%xmm2 \n" - "pshufb %8, %%xmm0 \n" - "pshufb %9, %%xmm1 \n" - "pshufb %10, %%xmm2 \n" - "por %%xmm1,%%xmm0 \n" - "por %%xmm2,%%xmm0 \n" - "movdqu %%xmm0,(%2) \n" - "lea 0x10(%2),%2 \n" - - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "movdqu 0x20(%0),%%xmm2 \n" - "pshufb %11, %%xmm0 \n" - "pshufb %12, %%xmm1 \n" - "pshufb %13, %%xmm2 \n" - "por %%xmm1,%%xmm0 \n" - "por %%xmm2,%%xmm0 \n" - 
"movdqu %%xmm0,(%3) \n" - "lea 0x10(%3),%3 \n" - "lea 0x30(%0),%0 \n" - "sub $0x10,%4 \n" - "jg 1b \n" - : "+r"(src_rgb), // %0 - "+r"(dst_r), // %1 - "+r"(dst_g), // %2 - "+r"(dst_b), // %3 - "+r"(width) // %4 - : "m"(kShuffleMaskRGBToR0), // %5 - "m"(kShuffleMaskRGBToR1), // %6 - "m"(kShuffleMaskRGBToR2), // %7 - "m"(kShuffleMaskRGBToG0), // %8 - "m"(kShuffleMaskRGBToG1), // %9 - "m"(kShuffleMaskRGBToG2), // %10 - "m"(kShuffleMaskRGBToB0), // %11 - "m"(kShuffleMaskRGBToB1), // %12 - "m"(kShuffleMaskRGBToB2) // %13 - : "memory", "cc", "xmm0", "xmm1", "xmm2"); -} -#endif // HAS_SPLITRGBROW_SSSE3 - -#ifdef HAS_MERGERGBROW_SSSE3 - -// Shuffle table for converting RGB to Planar. -static const uvec8 kShuffleMaskRToRGB0 = {0u, 128u, 128u, 1u, 128u, 128u, - 2u, 128u, 128u, 3u, 128u, 128u, - 4u, 128u, 128u, 5u}; -static const uvec8 kShuffleMaskGToRGB0 = {128u, 0u, 128u, 128u, 1u, 128u, - 128u, 2u, 128u, 128u, 3u, 128u, - 128u, 4u, 128u, 128u}; -static const uvec8 kShuffleMaskBToRGB0 = {128u, 128u, 0u, 128u, 128u, 1u, - 128u, 128u, 2u, 128u, 128u, 3u, - 128u, 128u, 4u, 128u}; - -static const uvec8 kShuffleMaskGToRGB1 = {5u, 128u, 128u, 6u, 128u, 128u, - 7u, 128u, 128u, 8u, 128u, 128u, - 9u, 128u, 128u, 10u}; -static const uvec8 kShuffleMaskBToRGB1 = {128u, 5u, 128u, 128u, 6u, 128u, - 128u, 7u, 128u, 128u, 8u, 128u, - 128u, 9u, 128u, 128u}; -static const uvec8 kShuffleMaskRToRGB1 = {128u, 128u, 6u, 128u, 128u, 7u, - 128u, 128u, 8u, 128u, 128u, 9u, - 128u, 128u, 10u, 128u}; - -static const uvec8 kShuffleMaskBToRGB2 = {10u, 128u, 128u, 11u, 128u, 128u, - 12u, 128u, 128u, 13u, 128u, 128u, - 14u, 128u, 128u, 15u}; -static const uvec8 kShuffleMaskRToRGB2 = {128u, 11u, 128u, 128u, 12u, 128u, - 128u, 13u, 128u, 128u, 14u, 128u, - 128u, 15u, 128u, 128u}; -static const uvec8 kShuffleMaskGToRGB2 = {128u, 128u, 11u, 128u, 128u, 12u, - 128u, 128u, 13u, 128u, 128u, 14u, - 128u, 128u, 15u, 128u}; - -void MergeRGBRow_SSSE3(const uint8_t* src_r, - const uint8_t* src_g, - const uint8_t* src_b, - uint8_t* dst_rgb, - int width) { - asm volatile( - - LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu (%1),%%xmm1 \n" - "movdqu (%2),%%xmm2 \n" - "pshufb %5, %%xmm0 \n" - "pshufb %6, %%xmm1 \n" - "pshufb %7, %%xmm2 \n" - "por %%xmm1,%%xmm0 \n" - "por %%xmm2,%%xmm0 \n" - "movdqu %%xmm0,(%3) \n" - - "movdqu (%0),%%xmm0 \n" - "movdqu (%1),%%xmm1 \n" - "movdqu (%2),%%xmm2 \n" - "pshufb %8, %%xmm0 \n" - "pshufb %9, %%xmm1 \n" - "pshufb %10, %%xmm2 \n" - "por %%xmm1,%%xmm0 \n" - "por %%xmm2,%%xmm0 \n" - "movdqu %%xmm0,16(%3) \n" - - "movdqu (%0),%%xmm0 \n" - "movdqu (%1),%%xmm1 \n" - "movdqu (%2),%%xmm2 \n" - "pshufb %11, %%xmm0 \n" - "pshufb %12, %%xmm1 \n" - "pshufb %13, %%xmm2 \n" - "por %%xmm1,%%xmm0 \n" - "por %%xmm2,%%xmm0 \n" - "movdqu %%xmm0,32(%3) \n" - - "lea 0x10(%0),%0 \n" - "lea 0x10(%1),%1 \n" - "lea 0x10(%2),%2 \n" - "lea 0x30(%3),%3 \n" - "sub $0x10,%4 \n" - "jg 1b \n" - : "+r"(src_r), // %0 - "+r"(src_g), // %1 - "+r"(src_b), // %2 - "+r"(dst_rgb), // %3 - "+r"(width) // %4 - : "m"(kShuffleMaskRToRGB0), // %5 - "m"(kShuffleMaskGToRGB0), // %6 - "m"(kShuffleMaskBToRGB0), // %7 - "m"(kShuffleMaskRToRGB1), // %8 - "m"(kShuffleMaskGToRGB1), // %9 - "m"(kShuffleMaskBToRGB1), // %10 - "m"(kShuffleMaskRToRGB2), // %11 - "m"(kShuffleMaskGToRGB2), // %12 - "m"(kShuffleMaskBToRGB2) // %13 - : "memory", "cc", "xmm0", "xmm1", "xmm2"); -} -#endif // HAS_MERGERGBROW_SSSE3 - -#ifdef HAS_MERGEARGBROW_SSE2 -void MergeARGBRow_SSE2(const uint8_t* src_r, - const uint8_t* src_g, - const uint8_t* src_b, - const uint8_t* src_a, 
- uint8_t* dst_argb, - int width) { - asm volatile( - - "sub %0,%1 \n" - "sub %0,%2 \n" - "sub %0,%3 \n" - - LABELALIGN - "1: \n" - - "movq (%0,%2),%%xmm0 \n" // B - "movq (%0),%%xmm1 \n" // R - "movq (%0,%1),%%xmm2 \n" // G - "punpcklbw %%xmm1,%%xmm0 \n" // BR - "movq (%0,%3),%%xmm1 \n" // A - "punpcklbw %%xmm1,%%xmm2 \n" // GA - "movdqa %%xmm0,%%xmm1 \n" // BR - "punpckhbw %%xmm2,%%xmm1 \n" // BGRA (hi) - "punpcklbw %%xmm2,%%xmm0 \n" // BGRA (lo) - "movdqu %%xmm0,(%4) \n" - "movdqu %%xmm1,16(%4) \n" - - "lea 8(%0),%0 \n" - "lea 32(%4),%4 \n" - "sub $0x8,%5 \n" - "jg 1b \n" - : "+r"(src_r), // %0 - "+r"(src_g), // %1 - "+r"(src_b), // %2 - "+r"(src_a), // %3 - "+r"(dst_argb), // %4 - "+r"(width) // %5 - : - : "memory", "cc", "xmm0", "xmm1", "xmm2"); -} -#endif - -#ifdef HAS_MERGEXRGBROW_SSE2 -void MergeXRGBRow_SSE2(const uint8_t* src_r, - const uint8_t* src_g, - const uint8_t* src_b, - uint8_t* dst_argb, - int width) { - asm volatile( - - LABELALIGN - "1: \n" - - "movq (%2),%%xmm0 \n" // B - "movq (%0),%%xmm1 \n" // R - "movq (%1),%%xmm2 \n" // G - "punpcklbw %%xmm1,%%xmm0 \n" // BR - "pcmpeqd %%xmm1,%%xmm1 \n" // A(255) - "punpcklbw %%xmm1,%%xmm2 \n" // GA - "movdqa %%xmm0,%%xmm1 \n" // BR - "punpckhbw %%xmm2,%%xmm1 \n" // BGRA (hi) - "punpcklbw %%xmm2,%%xmm0 \n" // BGRA (lo) - "movdqu %%xmm0,(%3) \n" - "movdqu %%xmm1,16(%3) \n" - - "lea 8(%0),%0 \n" - "lea 8(%1),%1 \n" - "lea 8(%2),%2 \n" - "lea 32(%3),%3 \n" - "sub $0x8,%4 \n" - "jg 1b \n" - : "+r"(src_r), // %0 - "+r"(src_g), // %1 - "+r"(src_b), // %2 - "+r"(dst_argb), // %3 - "+r"(width) // %4 - : - : "memory", "cc", "xmm0", "xmm1", "xmm2"); -} -#endif // HAS_MERGEARGBROW_SSE2 - -#ifdef HAS_MERGEARGBROW_AVX2 -void MergeARGBRow_AVX2(const uint8_t* src_r, - const uint8_t* src_g, - const uint8_t* src_b, - const uint8_t* src_a, - uint8_t* dst_argb, - int width) { - asm volatile( - - "sub %0,%1 \n" - "sub %0,%2 \n" - "sub %0,%3 \n" - - LABELALIGN - "1: \n" - - "vmovdqu (%0,%2),%%xmm0 \n" // B - "vmovdqu (%0,%1),%%xmm1 \n" // R - "vinserti128 $1,(%0),%%ymm0,%%ymm0 \n" // G - "vinserti128 $1,(%0,%3),%%ymm1,%%ymm1 \n" // A - "vpunpckhbw %%ymm1,%%ymm0,%%ymm2 \n" - "vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n" - "vperm2i128 $0x31,%%ymm2,%%ymm0,%%ymm1 \n" - "vperm2i128 $0x20,%%ymm2,%%ymm0,%%ymm0 \n" - "vpunpckhwd %%ymm1,%%ymm0,%%ymm2 \n" - "vpunpcklwd %%ymm1,%%ymm0,%%ymm0 \n" - "vperm2i128 $0x31,%%ymm2,%%ymm0,%%ymm1 \n" - "vperm2i128 $0x20,%%ymm2,%%ymm0,%%ymm0 \n" - "vmovdqu %%ymm0,(%4) \n" // First 8 - "vmovdqu %%ymm1,32(%4) \n" // Next 8 - - "lea 16(%0),%0 \n" - "lea 64(%4),%4 \n" - "sub $0x10,%5 \n" - "jg 1b \n" - "vzeroupper \n" - : "+r"(src_r), // %0 - "+r"(src_g), // %1 - "+r"(src_b), // %2 - "+r"(src_a), // %3 - "+r"(dst_argb), // %4 - "+r"(width) // %5 - : - : "memory", "cc", "xmm0", "xmm1", "xmm2"); -} -#endif - -#ifdef HAS_MERGEXRGBROW_AVX2 -void MergeXRGBRow_AVX2(const uint8_t* src_r, - const uint8_t* src_g, - const uint8_t* src_b, - uint8_t* dst_argb, - int width) { - asm volatile( - - LABELALIGN - "1: \n" - - "vmovdqu (%2),%%xmm0 \n" // B - "vpcmpeqd %%ymm1,%%ymm1,%%ymm1 \n" // A(255) - "vinserti128 $0,(%1),%%ymm1,%%ymm1 \n" // R - "vinserti128 $1,(%0),%%ymm0,%%ymm0 \n" // G - "vpunpckhbw %%ymm1,%%ymm0,%%ymm2 \n" - "vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n" - "vperm2i128 $0x31,%%ymm2,%%ymm0,%%ymm1 \n" - "vperm2i128 $0x20,%%ymm2,%%ymm0,%%ymm0 \n" - "vpunpckhwd %%ymm1,%%ymm0,%%ymm2 \n" - "vpunpcklwd %%ymm1,%%ymm0,%%ymm0 \n" - "vperm2i128 $0x31,%%ymm2,%%ymm0,%%ymm1 \n" - "vperm2i128 $0x20,%%ymm2,%%ymm0,%%ymm0 \n" - "vmovdqu %%ymm0,(%3) \n" // First 
8 - "vmovdqu %%ymm1,32(%3) \n" // Next 8 - - "lea 16(%0),%0 \n" - "lea 16(%1),%1 \n" - "lea 16(%2),%2 \n" - "lea 64(%3),%3 \n" - "sub $0x10,%4 \n" - "jg 1b \n" - "vzeroupper \n" - : "+r"(src_r), // %0 - "+r"(src_g), // %1 - "+r"(src_b), // %2 - "+r"(dst_argb), // %3 - "+rm"(width) // %4 - : - : "memory", "cc", "xmm0", "xmm1", "xmm2"); -} -#endif // HAS_MERGEARGBROW_AVX2 - -#ifdef HAS_SPLITARGBROW_SSE2 -void SplitARGBRow_SSE2(const uint8_t* src_argb, - uint8_t* dst_r, - uint8_t* dst_g, - uint8_t* dst_b, - uint8_t* dst_a, - int width) { - asm volatile( - - "sub %1,%2 \n" - "sub %1,%3 \n" - "sub %1,%4 \n" - - LABELALIGN - "1: \n" - - "movdqu (%0),%%xmm0 \n" // 00-0F - "movdqu 16(%0),%%xmm1 \n" // 10-1F - "movdqa %%xmm0,%%xmm2 \n" - "punpcklqdq %%xmm1,%%xmm0 \n" // 00-07 10-17 - "punpckhqdq %%xmm1,%%xmm2 \n" // 08-0F 18-1F - "movdqa %%xmm0,%%xmm1 \n" - "punpcklbw %%xmm2,%%xmm0 \n" // 08192A3B4C5D6E7F (lo) - "punpckhbw %%xmm2,%%xmm1 \n" // 08192A3B4C5D6E7F (hi) - "movdqa %%xmm0,%%xmm2 \n" - "punpcklqdq %%xmm1,%%xmm0 \n" // 08192A3B08192A3B - "punpckhqdq %%xmm1,%%xmm2 \n" // 4C5D6E7F4C5D6E7F - "movdqa %%xmm0,%%xmm1 \n" - "punpcklbw %%xmm2,%%xmm0 \n" // 048C159D26AE37BF (lo) - "punpckhbw %%xmm2,%%xmm1 \n" // 048C159D26AE37BF (hi) - "movdqa %%xmm0,%%xmm2 \n" - "punpckldq %%xmm1,%%xmm0 \n" // 048C048C159D159D (BG) - "punpckhdq %%xmm1,%%xmm2 \n" // 26AE26AE37BF37BF (RA) - "movlps %%xmm0,(%1,%3) \n" // B - "movhps %%xmm0,(%1,%2) \n" // G - "movlps %%xmm2,(%1) \n" // R - "movhps %%xmm2,(%1,%4) \n" // A - - "lea 32(%0),%0 \n" - "lea 8(%1),%1 \n" - "sub $0x8,%5 \n" - "jg 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_r), // %1 - "+r"(dst_g), // %2 - "+r"(dst_b), // %3 - "+r"(dst_a), // %4 - "+rm"(width) // %5 - : - : "memory", "cc", "xmm0", "xmm1", "xmm2"); -} -#endif - -#ifdef HAS_SPLITXRGBROW_SSE2 -void SplitXRGBRow_SSE2(const uint8_t* src_argb, - uint8_t* dst_r, - uint8_t* dst_g, - uint8_t* dst_b, - int width) { - asm volatile( - - LABELALIGN - "1: \n" - - "movdqu (%0),%%xmm0 \n" // 00-0F - "movdqu 16(%0),%%xmm1 \n" // 10-1F - "movdqa %%xmm0,%%xmm2 \n" - "punpcklqdq %%xmm1,%%xmm0 \n" // 00-07 10-17 - "punpckhqdq %%xmm1,%%xmm2 \n" // 08-0F 18-1F - "movdqa %%xmm0,%%xmm1 \n" - "punpcklbw %%xmm2,%%xmm0 \n" // 08192A3B4C5D6E7F (lo) - "punpckhbw %%xmm2,%%xmm1 \n" // 08192A3B4C5D6E7F (hi) - "movdqa %%xmm0,%%xmm2 \n" - "punpcklqdq %%xmm1,%%xmm0 \n" // 08192A3B08192A3B - "punpckhqdq %%xmm1,%%xmm2 \n" // 4C5D6E7F4C5D6E7F - "movdqa %%xmm0,%%xmm1 \n" - "punpcklbw %%xmm2,%%xmm0 \n" // 048C159D26AE37BF (lo) - "punpckhbw %%xmm2,%%xmm1 \n" // 048C159D26AE37BF (hi) - "movdqa %%xmm0,%%xmm2 \n" - "punpckldq %%xmm1,%%xmm0 \n" // 048C048C159D159D (BG) - "punpckhdq %%xmm1,%%xmm2 \n" // 26AE26AE37BF37BF (RA) - "movlps %%xmm0,(%3) \n" // B - "movhps %%xmm0,(%2) \n" // G - "movlps %%xmm2,(%1) \n" // R - - "lea 32(%0),%0 \n" - "lea 8(%1),%1 \n" - "lea 8(%2),%2 \n" - "lea 8(%3),%3 \n" - "sub $0x8,%4 \n" - "jg 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_r), // %1 - "+r"(dst_g), // %2 - "+r"(dst_b), // %3 - "+rm"(width) // %4 - : - : "memory", "cc", "xmm0", "xmm1", "xmm2"); -} -#endif - -static const uvec8 kShuffleMaskARGBSplit = {0, 4, 8, 12, 1, 5, 9, 13, - 2, 6, 10, 14, 3, 7, 11, 15}; -#ifdef HAS_SPLITARGBROW_SSSE3 -void SplitARGBRow_SSSE3(const uint8_t* src_argb, - uint8_t* dst_r, - uint8_t* dst_g, - uint8_t* dst_b, - uint8_t* dst_a, - int width) { - asm volatile( - - "movdqa %6,%%xmm3 \n" - "sub %1,%2 \n" - "sub %1,%3 \n" - "sub %1,%4 \n" - - LABELALIGN - "1: \n" - - "movdqu (%0),%%xmm0 \n" // 00-0F - "movdqu 16(%0),%%xmm1 \n" 
// 10-1F - "pshufb %%xmm3,%%xmm0 \n" // 048C159D26AE37BF (lo) - "pshufb %%xmm3,%%xmm1 \n" // 048C159D26AE37BF (hi) - "movdqa %%xmm0,%%xmm2 \n" - "punpckldq %%xmm1,%%xmm0 \n" // 048C048C159D159D (BG) - "punpckhdq %%xmm1,%%xmm2 \n" // 26AE26AE37BF37BF (RA) - "movlps %%xmm0,(%1,%3) \n" // B - "movhps %%xmm0,(%1,%2) \n" // G - "movlps %%xmm2,(%1) \n" // R - "movhps %%xmm2,(%1,%4) \n" // A - - "lea 32(%0),%0 \n" - "lea 8(%1),%1 \n" - "subl $0x8,%5 \n" - "jg 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_r), // %1 - "+r"(dst_g), // %2 - "+r"(dst_b), // %3 - "+r"(dst_a), // %4 -#if defined(__i386__) - "+m"(width) // %5 -#else - "+rm"(width) // %5 -#endif - : "m"(kShuffleMaskARGBSplit) // %6 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3"); -} -#endif - -#ifdef HAS_SPLITXRGBROW_SSSE3 -void SplitXRGBRow_SSSE3(const uint8_t* src_argb, - uint8_t* dst_r, - uint8_t* dst_g, - uint8_t* dst_b, - int width) { - asm volatile( - - "movdqa %5,%%xmm3 \n" - - LABELALIGN - "1: \n" - - "movdqu (%0),%%xmm0 \n" // 00-0F - "movdqu 16(%0),%%xmm1 \n" // 10-1F - "pshufb %%xmm3,%%xmm0 \n" // 048C159D26AE37BF (lo) - "pshufb %%xmm3,%%xmm1 \n" // 048C159D26AE37BF (hi) - "movdqa %%xmm0,%%xmm2 \n" - "punpckldq %%xmm1,%%xmm0 \n" // 048C048C159D159D (BG) - "punpckhdq %%xmm1,%%xmm2 \n" // 26AE26AE37BF37BF (RA) - "movlps %%xmm0,(%3) \n" // B - "movhps %%xmm0,(%2) \n" // G - "movlps %%xmm2,(%1) \n" // R - - "lea 32(%0),%0 \n" - "lea 8(%1),%1 \n" - "lea 8(%2),%2 \n" - "lea 8(%3),%3 \n" - "sub $0x8,%4 \n" - "jg 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_r), // %1 - "+r"(dst_g), // %2 - "+r"(dst_b), // %3 - "+r"(width) // %4 - : "m"(kShuffleMaskARGBSplit) // %5 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3"); -} -#endif - -#ifdef HAS_SPLITARGBROW_AVX2 -static const ulvec32 kShuffleMaskARGBPermute = {0, 4, 1, 5, 2, 6, 3, 7}; -void SplitARGBRow_AVX2(const uint8_t* src_argb, - uint8_t* dst_r, - uint8_t* dst_g, - uint8_t* dst_b, - uint8_t* dst_a, - int width) { - asm volatile( - - "sub %1,%2 \n" - "sub %1,%3 \n" - "sub %1,%4 \n" - "vmovdqa %7,%%ymm3 \n" - "vbroadcastf128 %6,%%ymm4 \n" - - LABELALIGN - "1: \n" - - "vmovdqu (%0),%%xmm0 \n" // 00-0F - "vmovdqu 16(%0),%%xmm1 \n" // 10-1F - "vinserti128 $1,32(%0),%%ymm0,%%ymm0 \n" // 00-0F 20-2F - "vinserti128 $1,48(%0),%%ymm1,%%ymm1 \n" // 10-1F 30-3F - "vpshufb %%ymm4,%%ymm0,%%ymm0 \n" - "vpshufb %%ymm4,%%ymm1,%%ymm1 \n" - "vpermd %%ymm0,%%ymm3,%%ymm0 \n" - "vpermd %%ymm1,%%ymm3,%%ymm1 \n" - "vpunpckhdq %%ymm1,%%ymm0,%%ymm2 \n" // GA - "vpunpckldq %%ymm1,%%ymm0,%%ymm0 \n" // BR - "vmovdqu %%xmm0,(%1,%3) \n" // B - "vextracti128 $1,%%ymm0,(%1) \n" // R - "vmovdqu %%xmm2,(%1,%2) \n" // G - "vextracti128 $1,%%ymm2,(%1,%4) \n" // A - "lea 64(%0),%0 \n" - "lea 16(%1),%1 \n" - "subl $0x10,%5 \n" - "jg 1b \n" - "vzeroupper \n" - : "+r"(src_argb), // %0 - "+r"(dst_r), // %1 - "+r"(dst_g), // %2 - "+r"(dst_b), // %3 - "+r"(dst_a), // %4 -#if defined(__i386__) - "+m"(width) // %5 -#else - "+rm"(width) // %5 -#endif - : "m"(kShuffleMaskARGBSplit), // %6 - "m"(kShuffleMaskARGBPermute) // %7 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"); -} -#endif - -#ifdef HAS_SPLITXRGBROW_AVX2 -void SplitXRGBRow_AVX2(const uint8_t* src_argb, - uint8_t* dst_r, - uint8_t* dst_g, - uint8_t* dst_b, - int width) { - asm volatile( - - "vmovdqa %6,%%ymm3 \n" - "vbroadcastf128 %5,%%ymm4 \n" - - LABELALIGN - "1: \n" - - "vmovdqu (%0),%%xmm0 \n" // 00-0F - "vmovdqu 16(%0),%%xmm1 \n" // 10-1F - "vinserti128 $1,32(%0),%%ymm0,%%ymm0 \n" // 00-0F 20-2F - "vinserti128 $1,48(%0),%%ymm1,%%ymm1 \n" // 10-1F 30-3F - 
"vpshufb %%ymm4,%%ymm0,%%ymm0 \n" - "vpshufb %%ymm4,%%ymm1,%%ymm1 \n" - "vpermd %%ymm0,%%ymm3,%%ymm0 \n" - "vpermd %%ymm1,%%ymm3,%%ymm1 \n" - "vpunpckhdq %%ymm1,%%ymm0,%%ymm2 \n" // GA - "vpunpckldq %%ymm1,%%ymm0,%%ymm0 \n" // BR - "vmovdqu %%xmm0,(%3) \n" // B - "vextracti128 $1,%%ymm0,(%1) \n" // R - "vmovdqu %%xmm2,(%2) \n" // G - - "lea 64(%0),%0 \n" - "lea 16(%1),%1 \n" - "lea 16(%2),%2 \n" - "lea 16(%3),%3 \n" - "sub $0x10,%4 \n" - "jg 1b \n" - "vzeroupper \n" - : "+r"(src_argb), // %0 - "+r"(dst_r), // %1 - "+r"(dst_g), // %2 - "+r"(dst_b), // %3 - "+r"(width) // %4 - : "m"(kShuffleMaskARGBSplit), // %5 - "m"(kShuffleMaskARGBPermute) // %6 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"); -} -#endif - -#ifdef HAS_MERGEXR30ROW_AVX2 -void MergeXR30Row_AVX2(const uint16_t* src_r, - const uint16_t* src_g, - const uint16_t* src_b, - uint8_t* dst_ar30, - int depth, - int width) { - int shift = depth - 10; - asm volatile( - - "sub %0,%1 \n" - "sub %0,%2 \n" - "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" // AR30 constants - "vpsrlw $14,%%ymm5,%%ymm5 \n" - "vpsllw $4,%%ymm5,%%ymm5 \n" // 2 alpha bits - "vpcmpeqb %%ymm6,%%ymm6,%%ymm6 \n" - "vpsrlw $6,%%ymm6,%%ymm6 \n" - "vmovd %5,%%xmm4 \n" - - LABELALIGN - "1: \n" - "vmovdqu (%0),%%ymm0 \n" - "vmovdqu (%0,%1),%%ymm1 \n" - "vmovdqu (%0,%2),%%ymm2 \n" - "vpsrlw %%xmm4,%%ymm0,%%ymm0 \n" - "vpsrlw %%xmm4,%%ymm1,%%ymm1 \n" - "vpsrlw %%xmm4,%%ymm2,%%ymm2 \n" - "vpminuw %%ymm0,%%ymm6,%%ymm0 \n" - "vpminuw %%ymm1,%%ymm6,%%ymm1 \n" - "vpminuw %%ymm2,%%ymm6,%%ymm2 \n" - "vpermq $0xd8,%%ymm0,%%ymm0 \n" - "vpermq $0xd8,%%ymm1,%%ymm1 \n" - "vpermq $0xd8,%%ymm2,%%ymm2 \n" - "vpsllw $0x4,%%ymm0,%%ymm0 \n" // Shift R to target bit - "vpunpckhwd %%ymm0,%%ymm2,%%ymm3 \n" // RB - "vpunpcklwd %%ymm0,%%ymm2,%%ymm0 \n" - "vpunpckhwd %%ymm5,%%ymm1,%%ymm2 \n" // AG - "vpunpcklwd %%ymm5,%%ymm1,%%ymm1 \n" - "vpslld $0xa,%%ymm1,%%ymm1 \n" // Shift AG to target bit - "vpslld $0xa,%%ymm2,%%ymm2 \n" - "vpor %%ymm1,%%ymm0,%%ymm0 \n" // Combine - "vpor %%ymm2,%%ymm3,%%ymm3 \n" - "vmovdqu %%ymm0,(%3) \n" - "vmovdqu %%ymm3,0x20(%3) \n" - "lea 0x20(%0),%0 \n" - "lea 0x40(%3),%3 \n" - "sub $0x10,%4 \n" - "jg 1b \n" - "vzeroupper \n" - : "+r"(src_r), // %0 - "+r"(src_g), // %1 - "+r"(src_b), // %2 - "+r"(dst_ar30), // %3 - "+r"(width) // %4 -#if defined(__i386__) - : "m"(shift) // %5 -#else - : "rm"(shift) // %5 -#endif - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); -} -#endif - -#ifdef HAS_MERGEAR64ROW_AVX2 -static const lvec32 MergeAR64Permute = {0, 4, 2, 6, 1, 5, 3, 7}; -void MergeAR64Row_AVX2(const uint16_t* src_r, - const uint16_t* src_g, - const uint16_t* src_b, - const uint16_t* src_a, - uint16_t* dst_ar64, - int depth, - int width) { - int shift = 16 - depth; - int mask = (1 << depth) - 1; - mask = (mask << 16) + mask; - asm volatile( - - "sub %0,%1 \n" - "sub %0,%2 \n" - "sub %0,%3 \n" - "vmovdqa %8,%%ymm5 \n" - "vmovd %6,%%xmm6 \n" - "vbroadcastss %7,%%ymm7 \n" - - LABELALIGN - "1: \n" - "vmovdqu (%0),%%ymm0 \n" // R - "vmovdqu (%0,%1),%%ymm1 \n" // G - "vmovdqu (%0,%2),%%ymm2 \n" // B - "vmovdqu (%0,%3),%%ymm3 \n" // A - "vpminuw %%ymm0,%%ymm7,%%ymm0 \n" - "vpminuw %%ymm1,%%ymm7,%%ymm1 \n" - "vpminuw %%ymm2,%%ymm7,%%ymm2 \n" - "vpminuw %%ymm3,%%ymm7,%%ymm3 \n" - "vpsllw %%xmm6,%%ymm0,%%ymm0 \n" - "vpsllw %%xmm6,%%ymm1,%%ymm1 \n" - "vpsllw %%xmm6,%%ymm2,%%ymm2 \n" - "vpsllw %%xmm6,%%ymm3,%%ymm3 \n" - "vpermd %%ymm0,%%ymm5,%%ymm0 \n" - "vpermd %%ymm1,%%ymm5,%%ymm1 \n" - "vpermd %%ymm2,%%ymm5,%%ymm2 \n" - "vpermd %%ymm3,%%ymm5,%%ymm3 \n" - 
"vpunpcklwd %%ymm1,%%ymm2,%%ymm4 \n" // BG(low) - "vpunpckhwd %%ymm1,%%ymm2,%%ymm1 \n" // BG(hi) - "vpunpcklwd %%ymm3,%%ymm0,%%ymm2 \n" // RA(low) - "vpunpckhwd %%ymm3,%%ymm0,%%ymm0 \n" // RA(hi) - "vpunpckldq %%ymm2,%%ymm4,%%ymm3 \n" // BGRA(1) - "vpunpckhdq %%ymm2,%%ymm4,%%ymm4 \n" // BGRA(3) - "vpunpckldq %%ymm0,%%ymm1,%%ymm2 \n" // BGRA(2) - "vpunpckhdq %%ymm0,%%ymm1,%%ymm1 \n" // BGRA(4) - "vmovdqu %%ymm3,(%4) \n" - "vmovdqu %%ymm2,0x20(%4) \n" - "vmovdqu %%ymm4,0x40(%4) \n" - "vmovdqu %%ymm1,0x60(%4) \n" - "lea 0x20(%0),%0 \n" - "lea 0x80(%4),%4 \n" - "subl $0x10,%5 \n" - "jg 1b \n" - "vzeroupper \n" - : "+r"(src_r), // %0 - "+r"(src_g), // %1 - "+r"(src_b), // %2 - "+r"(src_a), // %3 - "+r"(dst_ar64), // %4 -#if defined(__i386__) - "+m"(width) // %5 -#else - "+rm"(width) // %5 -#endif - : "m"(shift), // %6 - "m"(mask), // %7 - "m"(MergeAR64Permute) // %8 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", - "xmm7"); -} -#endif - -#ifdef HAS_MERGEXR64ROW_AVX2 -void MergeXR64Row_AVX2(const uint16_t* src_r, - const uint16_t* src_g, - const uint16_t* src_b, - uint16_t* dst_ar64, - int depth, - int width) { - int shift = 16 - depth; - int mask = (1 << depth) - 1; - mask = (mask << 16) + mask; - asm volatile( - - "sub %0,%1 \n" - "sub %0,%2 \n" - "vmovdqa %7,%%ymm5 \n" - "vmovd %5,%%xmm6 \n" - "vbroadcastss %6,%%ymm7 \n" - - LABELALIGN - "1: \n" - "vmovdqu (%0),%%ymm0 \n" // R - "vmovdqu (%0,%1),%%ymm1 \n" // G - "vmovdqu (%0,%2),%%ymm2 \n" // B - "vpminuw %%ymm0,%%ymm7,%%ymm0 \n" - "vpminuw %%ymm1,%%ymm7,%%ymm1 \n" - "vpminuw %%ymm2,%%ymm7,%%ymm2 \n" - "vpsllw %%xmm6,%%ymm0,%%ymm0 \n" - "vpsllw %%xmm6,%%ymm1,%%ymm1 \n" - "vpsllw %%xmm6,%%ymm2,%%ymm2 \n" - "vpermd %%ymm0,%%ymm5,%%ymm0 \n" - "vpermd %%ymm1,%%ymm5,%%ymm1 \n" - "vpermd %%ymm2,%%ymm5,%%ymm2 \n" - "vpcmpeqb %%ymm3,%%ymm3,%%ymm3 \n" // A (0xffff) - "vpunpcklwd %%ymm1,%%ymm2,%%ymm4 \n" // BG(low) - "vpunpckhwd %%ymm1,%%ymm2,%%ymm1 \n" // BG(hi) - "vpunpcklwd %%ymm3,%%ymm0,%%ymm2 \n" // RA(low) - "vpunpckhwd %%ymm3,%%ymm0,%%ymm0 \n" // RA(hi) - "vpunpckldq %%ymm2,%%ymm4,%%ymm3 \n" // BGRA(1) - "vpunpckhdq %%ymm2,%%ymm4,%%ymm4 \n" // BGRA(3) - "vpunpckldq %%ymm0,%%ymm1,%%ymm2 \n" // BGRA(2) - "vpunpckhdq %%ymm0,%%ymm1,%%ymm1 \n" // BGRA(4) - "vmovdqu %%ymm3,(%3) \n" - "vmovdqu %%ymm2,0x20(%3) \n" - "vmovdqu %%ymm4,0x40(%3) \n" - "vmovdqu %%ymm1,0x60(%3) \n" - "lea 0x20(%0),%0 \n" - "lea 0x80(%3),%3 \n" - "subl $0x10,%4 \n" - "jg 1b \n" - "vzeroupper \n" - : "+r"(src_r), // %0 - "+r"(src_g), // %1 - "+r"(src_b), // %2 - "+r"(dst_ar64), // %3 - "+r"(width) // %4 - : "m"(shift), // %5 - "m"(mask), // %6 - "m"(MergeAR64Permute) // %7 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", - "xmm7"); -} -#endif - -#ifdef HAS_MERGEARGB16TO8ROW_AVX2 -static const uvec8 MergeARGB16To8Shuffle = {0, 8, 1, 9, 2, 10, 3, 11, - 4, 12, 5, 13, 6, 14, 7, 15}; -void MergeARGB16To8Row_AVX2(const uint16_t* src_r, - const uint16_t* src_g, - const uint16_t* src_b, - const uint16_t* src_a, - uint8_t* dst_argb, - int depth, - int width) { - int shift = depth - 8; - asm volatile( - - "sub %0,%1 \n" - "sub %0,%2 \n" - "sub %0,%3 \n" - "vbroadcastf128 %7,%%ymm5 \n" - "vmovd %6,%%xmm6 \n" - - LABELALIGN - "1: \n" - "vmovdqu (%0),%%ymm0 \n" // R - "vmovdqu (%0,%1),%%ymm1 \n" // G - "vmovdqu (%0,%2),%%ymm2 \n" // B - "vmovdqu (%0,%3),%%ymm3 \n" // A - "vpsrlw %%xmm6,%%ymm0,%%ymm0 \n" - "vpsrlw %%xmm6,%%ymm1,%%ymm1 \n" - "vpsrlw %%xmm6,%%ymm2,%%ymm2 \n" - "vpsrlw %%xmm6,%%ymm3,%%ymm3 \n" - "vpackuswb 
%%ymm1,%%ymm2,%%ymm1 \n" // BG (planar) - "vpackuswb %%ymm3,%%ymm0,%%ymm0 \n" // RA (planar) - "vpshufb %%ymm5,%%ymm1,%%ymm1 \n" // BG (interleave) - "vpshufb %%ymm5,%%ymm0,%%ymm0 \n" // RA (interleave) - "vpermq $0xd8,%%ymm1,%%ymm1 \n" - "vpermq $0xd8,%%ymm0,%%ymm0 \n" - "vpunpcklwd %%ymm0,%%ymm1,%%ymm2 \n" // BGRA (low) - "vpunpckhwd %%ymm0,%%ymm1,%%ymm0 \n" // BGRA (hi) - "vmovdqu %%ymm2,(%4) \n" - "vmovdqu %%ymm0,0x20(%4) \n" - "lea 0x20(%0),%0 \n" - "lea 0x40(%4),%4 \n" - "subl $0x10,%5 \n" - "jg 1b \n" - "vzeroupper \n" - : "+r"(src_r), // %0 - "+r"(src_g), // %1 - "+r"(src_b), // %2 - "+r"(src_a), // %3 - "+r"(dst_argb), // %4 -#if defined(__i386__) - "+m"(width) // %5 -#else - "+rm"(width) // %5 -#endif - : "m"(shift), // %6 - "m"(MergeARGB16To8Shuffle) // %7 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"); -} -#endif - -#ifdef HAS_MERGEXRGB16TO8ROW_AVX2 -void MergeXRGB16To8Row_AVX2(const uint16_t* src_r, - const uint16_t* src_g, - const uint16_t* src_b, - uint8_t* dst_argb, - int depth, - int width) { - int shift = depth - 8; - asm volatile( - - "sub %0,%1 \n" - "sub %0,%2 \n" - "vbroadcastf128 %6,%%ymm5 \n" - "vmovd %5,%%xmm6 \n" - "vpcmpeqb %%ymm3,%%ymm3,%%ymm3 \n" - "vpsrlw $8,%%ymm3,%%ymm3 \n" // A (0xff) - - LABELALIGN - "1: \n" - "vmovdqu (%0),%%ymm0 \n" // R - "vmovdqu (%0,%1),%%ymm1 \n" // G - "vmovdqu (%0,%2),%%ymm2 \n" // B - "vpsrlw %%xmm6,%%ymm0,%%ymm0 \n" - "vpsrlw %%xmm6,%%ymm1,%%ymm1 \n" - "vpsrlw %%xmm6,%%ymm2,%%ymm2 \n" - "vpackuswb %%ymm1,%%ymm2,%%ymm1 \n" // BG (planar) - "vpackuswb %%ymm3,%%ymm0,%%ymm0 \n" // RA (planar) - "vpshufb %%ymm5,%%ymm1,%%ymm1 \n" // BG (interleave) - "vpshufb %%ymm5,%%ymm0,%%ymm0 \n" // RA (interleave) - "vpermq $0xd8,%%ymm1,%%ymm1 \n" - "vpermq $0xd8,%%ymm0,%%ymm0 \n" - "vpunpcklwd %%ymm0,%%ymm1,%%ymm2 \n" // BGRA (low) - "vpunpckhwd %%ymm0,%%ymm1,%%ymm0 \n" // BGRA (hi) - "vmovdqu %%ymm2,(%3) \n" - "vmovdqu %%ymm0,0x20(%3) \n" - "lea 0x20(%0),%0 \n" - "lea 0x40(%3),%3 \n" - "subl $0x10,%4 \n" - "jg 1b \n" - "vzeroupper \n" - : "+r"(src_r), // %0 - "+r"(src_g), // %1 - "+r"(src_b), // %2 - "+r"(dst_argb), // %3 - "+r"(width) // %4 - : "m"(shift), // %5 - "m"(MergeARGB16To8Shuffle) // %6 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"); -} -#endif - -#ifdef HAS_COPYROW_SSE2 -void CopyRow_SSE2(const uint8_t* src, uint8_t* dst, int width) { - asm volatile( - "test $0xf,%0 \n" - "jne 2f \n" - "test $0xf,%1 \n" - "jne 2f \n" - - LABELALIGN - "1: \n" - "movdqa (%0),%%xmm0 \n" - "movdqa 0x10(%0),%%xmm1 \n" - "lea 0x20(%0),%0 \n" - "movdqa %%xmm0,(%1) \n" - "movdqa %%xmm1,0x10(%1) \n" - "lea 0x20(%1),%1 \n" - "sub $0x20,%2 \n" - "jg 1b \n" - "jmp 9f \n" - - LABELALIGN - "2: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "lea 0x20(%0),%0 \n" - "movdqu %%xmm0,(%1) \n" - "movdqu %%xmm1,0x10(%1) \n" - "lea 0x20(%1),%1 \n" - "sub $0x20,%2 \n" - "jg 2b \n" - - LABELALIGN "9: \n" - : "+r"(src), // %0 - "+r"(dst), // %1 - "+r"(width) // %2 - : - : "memory", "cc", "xmm0", "xmm1"); -} -#endif // HAS_COPYROW_SSE2 - -#ifdef HAS_COPYROW_AVX -void CopyRow_AVX(const uint8_t* src, uint8_t* dst, int width) { - asm volatile( - - LABELALIGN - "1: \n" - "vmovdqu (%0),%%ymm0 \n" - "vmovdqu 0x20(%0),%%ymm1 \n" - "lea 0x40(%0),%0 \n" - "vmovdqu %%ymm0,(%1) \n" - "vmovdqu %%ymm1,0x20(%1) \n" - "lea 0x40(%1),%1 \n" - "sub $0x40,%2 \n" - "jg 1b \n" - : "+r"(src), // %0 - "+r"(dst), // %1 - "+r"(width) // %2 - : - : "memory", "cc", "xmm0", "xmm1"); -} -#endif // HAS_COPYROW_AVX - -#ifdef 
HAS_COPYROW_ERMS -// Multiple of 1. -void CopyRow_ERMS(const uint8_t* src, uint8_t* dst, int width) { - size_t width_tmp = (size_t)(width); - asm volatile( - - "rep movsb \n" - : "+S"(src), // %0 - "+D"(dst), // %1 - "+c"(width_tmp) // %2 - : - : "memory", "cc"); -} -#endif // HAS_COPYROW_ERMS - -#ifdef HAS_ARGBCOPYALPHAROW_SSE2 -// width in pixels -void ARGBCopyAlphaRow_SSE2(const uint8_t* src, uint8_t* dst, int width) { - asm volatile( - "pcmpeqb %%xmm0,%%xmm0 \n" - "pslld $0x18,%%xmm0 \n" - "pcmpeqb %%xmm1,%%xmm1 \n" - "psrld $0x8,%%xmm1 \n" - - LABELALIGN - "1: \n" - "movdqu (%0),%%xmm2 \n" - "movdqu 0x10(%0),%%xmm3 \n" - "lea 0x20(%0),%0 \n" - "movdqu (%1),%%xmm4 \n" - "movdqu 0x10(%1),%%xmm5 \n" - "pand %%xmm0,%%xmm2 \n" - "pand %%xmm0,%%xmm3 \n" - "pand %%xmm1,%%xmm4 \n" - "pand %%xmm1,%%xmm5 \n" - "por %%xmm4,%%xmm2 \n" - "por %%xmm5,%%xmm3 \n" - "movdqu %%xmm2,(%1) \n" - "movdqu %%xmm3,0x10(%1) \n" - "lea 0x20(%1),%1 \n" - "sub $0x8,%2 \n" - "jg 1b \n" - : "+r"(src), // %0 - "+r"(dst), // %1 - "+r"(width) // %2 - : - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); -} -#endif // HAS_ARGBCOPYALPHAROW_SSE2 - -#ifdef HAS_ARGBCOPYALPHAROW_AVX2 -// width in pixels -void ARGBCopyAlphaRow_AVX2(const uint8_t* src, uint8_t* dst, int width) { - asm volatile( - "vpcmpeqb %%ymm0,%%ymm0,%%ymm0 \n" - "vpsrld $0x8,%%ymm0,%%ymm0 \n" - - LABELALIGN - "1: \n" - "vmovdqu (%0),%%ymm1 \n" - "vmovdqu 0x20(%0),%%ymm2 \n" - "lea 0x40(%0),%0 \n" - "vpblendvb %%ymm0,(%1),%%ymm1,%%ymm1 \n" - "vpblendvb %%ymm0,0x20(%1),%%ymm2,%%ymm2 \n" - "vmovdqu %%ymm1,(%1) \n" - "vmovdqu %%ymm2,0x20(%1) \n" - "lea 0x40(%1),%1 \n" - "sub $0x10,%2 \n" - "jg 1b \n" - "vzeroupper \n" - : "+r"(src), // %0 - "+r"(dst), // %1 - "+r"(width) // %2 - : - : "memory", "cc", "xmm0", "xmm1", "xmm2"); -} -#endif // HAS_ARGBCOPYALPHAROW_AVX2 - -#ifdef HAS_ARGBEXTRACTALPHAROW_SSE2 -// width in pixels -void ARGBExtractAlphaRow_SSE2(const uint8_t* src_argb, - uint8_t* dst_a, - int width) { - asm volatile( - - LABELALIGN - "1: \n" - "movdqu (%0), %%xmm0 \n" - "movdqu 0x10(%0), %%xmm1 \n" - "lea 0x20(%0), %0 \n" - "psrld $0x18, %%xmm0 \n" - "psrld $0x18, %%xmm1 \n" - "packssdw %%xmm1, %%xmm0 \n" - "packuswb %%xmm0, %%xmm0 \n" - "movq %%xmm0,(%1) \n" - "lea 0x8(%1), %1 \n" - "sub $0x8, %2 \n" - "jg 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_a), // %1 - "+rm"(width) // %2 - : - : "memory", "cc", "xmm0", "xmm1"); -} -#endif // HAS_ARGBEXTRACTALPHAROW_SSE2 - -#ifdef HAS_ARGBEXTRACTALPHAROW_AVX2 -static const uvec8 kShuffleAlphaShort_AVX2 = { - 3u, 128u, 128u, 128u, 7u, 128u, 128u, 128u, - 11u, 128u, 128u, 128u, 15u, 128u, 128u, 128u}; - -void ARGBExtractAlphaRow_AVX2(const uint8_t* src_argb, - uint8_t* dst_a, - int width) { - asm volatile( - "vmovdqa %3,%%ymm4 \n" - "vbroadcastf128 %4,%%ymm5 \n" - - LABELALIGN - "1: \n" - "vmovdqu (%0), %%ymm0 \n" - "vmovdqu 0x20(%0), %%ymm1 \n" - "vpshufb %%ymm5,%%ymm0,%%ymm0 \n" // vpsrld $0x18, %%ymm0 - "vpshufb %%ymm5,%%ymm1,%%ymm1 \n" - "vmovdqu 0x40(%0), %%ymm2 \n" - "vmovdqu 0x60(%0), %%ymm3 \n" - "lea 0x80(%0), %0 \n" - "vpackssdw %%ymm1, %%ymm0, %%ymm0 \n" // mutates - "vpshufb %%ymm5,%%ymm2,%%ymm2 \n" - "vpshufb %%ymm5,%%ymm3,%%ymm3 \n" - "vpackssdw %%ymm3, %%ymm2, %%ymm2 \n" // mutates - "vpackuswb %%ymm2,%%ymm0,%%ymm0 \n" // mutates. - "vpermd %%ymm0,%%ymm4,%%ymm0 \n" // unmutate. 
- "vmovdqu %%ymm0,(%1) \n" - "lea 0x20(%1),%1 \n" - "sub $0x20, %2 \n" - "jg 1b \n" - "vzeroupper \n" - : "+r"(src_argb), // %0 - "+r"(dst_a), // %1 - "+rm"(width) // %2 - : "m"(kPermdARGBToY_AVX), // %3 - "m"(kShuffleAlphaShort_AVX2) // %4 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); -} -#endif // HAS_ARGBEXTRACTALPHAROW_AVX2 - -#ifdef HAS_ARGBCOPYYTOALPHAROW_SSE2 -// width in pixels -void ARGBCopyYToAlphaRow_SSE2(const uint8_t* src, uint8_t* dst, int width) { - asm volatile( - "pcmpeqb %%xmm0,%%xmm0 \n" - "pslld $0x18,%%xmm0 \n" - "pcmpeqb %%xmm1,%%xmm1 \n" - "psrld $0x8,%%xmm1 \n" - - LABELALIGN - "1: \n" - "movq (%0),%%xmm2 \n" - "lea 0x8(%0),%0 \n" - "punpcklbw %%xmm2,%%xmm2 \n" - "punpckhwd %%xmm2,%%xmm3 \n" - "punpcklwd %%xmm2,%%xmm2 \n" - "movdqu (%1),%%xmm4 \n" - "movdqu 0x10(%1),%%xmm5 \n" - "pand %%xmm0,%%xmm2 \n" - "pand %%xmm0,%%xmm3 \n" - "pand %%xmm1,%%xmm4 \n" - "pand %%xmm1,%%xmm5 \n" - "por %%xmm4,%%xmm2 \n" - "por %%xmm5,%%xmm3 \n" - "movdqu %%xmm2,(%1) \n" - "movdqu %%xmm3,0x10(%1) \n" - "lea 0x20(%1),%1 \n" - "sub $0x8,%2 \n" - "jg 1b \n" - : "+r"(src), // %0 - "+r"(dst), // %1 - "+r"(width) // %2 - : - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); -} -#endif // HAS_ARGBCOPYYTOALPHAROW_SSE2 - -#ifdef HAS_ARGBCOPYYTOALPHAROW_AVX2 -// width in pixels -void ARGBCopyYToAlphaRow_AVX2(const uint8_t* src, uint8_t* dst, int width) { - asm volatile( - "vpcmpeqb %%ymm0,%%ymm0,%%ymm0 \n" - "vpsrld $0x8,%%ymm0,%%ymm0 \n" - - LABELALIGN - "1: \n" - "vpmovzxbd (%0),%%ymm1 \n" - "vpmovzxbd 0x8(%0),%%ymm2 \n" - "lea 0x10(%0),%0 \n" - "vpslld $0x18,%%ymm1,%%ymm1 \n" - "vpslld $0x18,%%ymm2,%%ymm2 \n" - "vpblendvb %%ymm0,(%1),%%ymm1,%%ymm1 \n" - "vpblendvb %%ymm0,0x20(%1),%%ymm2,%%ymm2 \n" - "vmovdqu %%ymm1,(%1) \n" - "vmovdqu %%ymm2,0x20(%1) \n" - "lea 0x40(%1),%1 \n" - "sub $0x10,%2 \n" - "jg 1b \n" - "vzeroupper \n" - : "+r"(src), // %0 - "+r"(dst), // %1 - "+r"(width) // %2 - : - : "memory", "cc", "xmm0", "xmm1", "xmm2"); -} -#endif // HAS_ARGBCOPYYTOALPHAROW_AVX2 - -#ifdef HAS_SETROW_X86 -void SetRow_X86(uint8_t* dst, uint8_t v8, int width) { - size_t width_tmp = (size_t)(width >> 2); - const uint32_t v32 = v8 * 0x01010101u; // Duplicate byte to all bytes. 
- asm volatile( - - "rep stosl \n" - : "+D"(dst), // %0 - "+c"(width_tmp) // %1 - : "a"(v32) // %2 - : "memory", "cc"); -} - -void SetRow_ERMS(uint8_t* dst, uint8_t v8, int width) { - size_t width_tmp = (size_t)(width); - asm volatile( - - "rep stosb \n" - : "+D"(dst), // %0 - "+c"(width_tmp) // %1 - : "a"(v8) // %2 - : "memory", "cc"); -} - -void ARGBSetRow_X86(uint8_t* dst_argb, uint32_t v32, int width) { - size_t width_tmp = (size_t)(width); - asm volatile( - - "rep stosl \n" - : "+D"(dst_argb), // %0 - "+c"(width_tmp) // %1 - : "a"(v32) // %2 - : "memory", "cc"); -} -#endif // HAS_SETROW_X86 - -#ifdef HAS_YUY2TOYROW_SSE2 -void YUY2ToYRow_SSE2(const uint8_t* src_yuy2, uint8_t* dst_y, int width) { - asm volatile( - "pcmpeqb %%xmm5,%%xmm5 \n" - "psrlw $0x8,%%xmm5 \n" - - LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "lea 0x20(%0),%0 \n" - "pand %%xmm5,%%xmm0 \n" - "pand %%xmm5,%%xmm1 \n" - "packuswb %%xmm1,%%xmm0 \n" - "movdqu %%xmm0,(%1) \n" - "lea 0x10(%1),%1 \n" - "sub $0x10,%2 \n" - "jg 1b \n" - : "+r"(src_yuy2), // %0 - "+r"(dst_y), // %1 - "+r"(width) // %2 - : - : "memory", "cc", "xmm0", "xmm1", "xmm5"); -} - -void YUY2ToUVRow_SSE2(const uint8_t* src_yuy2, - int stride_yuy2, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - asm volatile( - "pcmpeqb %%xmm5,%%xmm5 \n" - "psrlw $0x8,%%xmm5 \n" - "sub %1,%2 \n" - - LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "movdqu 0x00(%0,%4,1),%%xmm2 \n" - "movdqu 0x10(%0,%4,1),%%xmm3 \n" - "lea 0x20(%0),%0 \n" - "pavgb %%xmm2,%%xmm0 \n" - "pavgb %%xmm3,%%xmm1 \n" - "psrlw $0x8,%%xmm0 \n" - "psrlw $0x8,%%xmm1 \n" - "packuswb %%xmm1,%%xmm0 \n" - "movdqa %%xmm0,%%xmm1 \n" - "pand %%xmm5,%%xmm0 \n" - "packuswb %%xmm0,%%xmm0 \n" - "psrlw $0x8,%%xmm1 \n" - "packuswb %%xmm1,%%xmm1 \n" - "movq %%xmm0,(%1) \n" - "movq %%xmm1,0x00(%1,%2,1) \n" - "lea 0x8(%1),%1 \n" - "sub $0x10,%3 \n" - "jg 1b \n" - : "+r"(src_yuy2), // %0 - "+r"(dst_u), // %1 - "+r"(dst_v), // %2 - "+r"(width) // %3 - : "r"((intptr_t)(stride_yuy2)) // %4 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"); -} - -void YUY2ToUV422Row_SSE2(const uint8_t* src_yuy2, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - asm volatile( - "pcmpeqb %%xmm5,%%xmm5 \n" - "psrlw $0x8,%%xmm5 \n" - "sub %1,%2 \n" - - LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "lea 0x20(%0),%0 \n" - "psrlw $0x8,%%xmm0 \n" - "psrlw $0x8,%%xmm1 \n" - "packuswb %%xmm1,%%xmm0 \n" - "movdqa %%xmm0,%%xmm1 \n" - "pand %%xmm5,%%xmm0 \n" - "packuswb %%xmm0,%%xmm0 \n" - "psrlw $0x8,%%xmm1 \n" - "packuswb %%xmm1,%%xmm1 \n" - "movq %%xmm0,(%1) \n" - "movq %%xmm1,0x00(%1,%2,1) \n" - "lea 0x8(%1),%1 \n" - "sub $0x10,%3 \n" - "jg 1b \n" - : "+r"(src_yuy2), // %0 - "+r"(dst_u), // %1 - "+r"(dst_v), // %2 - "+r"(width) // %3 - : - : "memory", "cc", "xmm0", "xmm1", "xmm5"); -} - -void UYVYToYRow_SSE2(const uint8_t* src_uyvy, uint8_t* dst_y, int width) { - asm volatile( - - LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "lea 0x20(%0),%0 \n" - "psrlw $0x8,%%xmm0 \n" - "psrlw $0x8,%%xmm1 \n" - "packuswb %%xmm1,%%xmm0 \n" - "movdqu %%xmm0,(%1) \n" - "lea 0x10(%1),%1 \n" - "sub $0x10,%2 \n" - "jg 1b \n" - : "+r"(src_uyvy), // %0 - "+r"(dst_y), // %1 - "+r"(width) // %2 - : - : "memory", "cc", "xmm0", "xmm1"); -} - -void UYVYToUVRow_SSE2(const uint8_t* src_uyvy, - int stride_uyvy, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - asm volatile( - "pcmpeqb %%xmm5,%%xmm5 \n" - "psrlw $0x8,%%xmm5 \n" - 
"sub %1,%2 \n" - - LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "movdqu 0x00(%0,%4,1),%%xmm2 \n" - "movdqu 0x10(%0,%4,1),%%xmm3 \n" - "lea 0x20(%0),%0 \n" - "pavgb %%xmm2,%%xmm0 \n" - "pavgb %%xmm3,%%xmm1 \n" - "pand %%xmm5,%%xmm0 \n" - "pand %%xmm5,%%xmm1 \n" - "packuswb %%xmm1,%%xmm0 \n" - "movdqa %%xmm0,%%xmm1 \n" - "pand %%xmm5,%%xmm0 \n" - "packuswb %%xmm0,%%xmm0 \n" - "psrlw $0x8,%%xmm1 \n" - "packuswb %%xmm1,%%xmm1 \n" - "movq %%xmm0,(%1) \n" - "movq %%xmm1,0x00(%1,%2,1) \n" - "lea 0x8(%1),%1 \n" - "sub $0x10,%3 \n" - "jg 1b \n" - : "+r"(src_uyvy), // %0 - "+r"(dst_u), // %1 - "+r"(dst_v), // %2 - "+r"(width) // %3 - : "r"((intptr_t)(stride_uyvy)) // %4 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"); -} - -void UYVYToUV422Row_SSE2(const uint8_t* src_uyvy, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - asm volatile( - "pcmpeqb %%xmm5,%%xmm5 \n" - "psrlw $0x8,%%xmm5 \n" - "sub %1,%2 \n" - - LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "lea 0x20(%0),%0 \n" - "pand %%xmm5,%%xmm0 \n" - "pand %%xmm5,%%xmm1 \n" - "packuswb %%xmm1,%%xmm0 \n" - "movdqa %%xmm0,%%xmm1 \n" - "pand %%xmm5,%%xmm0 \n" - "packuswb %%xmm0,%%xmm0 \n" - "psrlw $0x8,%%xmm1 \n" - "packuswb %%xmm1,%%xmm1 \n" - "movq %%xmm0,(%1) \n" - "movq %%xmm1,0x00(%1,%2,1) \n" - "lea 0x8(%1),%1 \n" - "sub $0x10,%3 \n" - "jg 1b \n" - : "+r"(src_uyvy), // %0 - "+r"(dst_u), // %1 - "+r"(dst_v), // %2 - "+r"(width) // %3 - : - : "memory", "cc", "xmm0", "xmm1", "xmm5"); -} -#endif // HAS_YUY2TOYROW_SSE2 - -#ifdef HAS_YUY2TOYROW_AVX2 -void YUY2ToYRow_AVX2(const uint8_t* src_yuy2, uint8_t* dst_y, int width) { - asm volatile( - "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" - "vpsrlw $0x8,%%ymm5,%%ymm5 \n" - - LABELALIGN - "1: \n" - "vmovdqu (%0),%%ymm0 \n" - "vmovdqu 0x20(%0),%%ymm1 \n" - "lea 0x40(%0),%0 \n" - "vpand %%ymm5,%%ymm0,%%ymm0 \n" - "vpand %%ymm5,%%ymm1,%%ymm1 \n" - "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" - "vpermq $0xd8,%%ymm0,%%ymm0 \n" - "vmovdqu %%ymm0,(%1) \n" - "lea 0x20(%1),%1 \n" - "sub $0x20,%2 \n" - "jg 1b \n" - "vzeroupper \n" - : "+r"(src_yuy2), // %0 - "+r"(dst_y), // %1 - "+r"(width) // %2 - : - : "memory", "cc", "xmm0", "xmm1", "xmm5"); -} - -void YUY2ToUVRow_AVX2(const uint8_t* src_yuy2, - int stride_yuy2, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - asm volatile( - "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" - "vpsrlw $0x8,%%ymm5,%%ymm5 \n" - "sub %1,%2 \n" - - LABELALIGN - "1: \n" - "vmovdqu (%0),%%ymm0 \n" - "vmovdqu 0x20(%0),%%ymm1 \n" - "vpavgb 0x00(%0,%4,1),%%ymm0,%%ymm0 \n" - "vpavgb 0x20(%0,%4,1),%%ymm1,%%ymm1 \n" - "lea 0x40(%0),%0 \n" - "vpsrlw $0x8,%%ymm0,%%ymm0 \n" - "vpsrlw $0x8,%%ymm1,%%ymm1 \n" - "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" - "vpermq $0xd8,%%ymm0,%%ymm0 \n" - "vpand %%ymm5,%%ymm0,%%ymm1 \n" - "vpsrlw $0x8,%%ymm0,%%ymm0 \n" - "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n" - "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n" - "vpermq $0xd8,%%ymm1,%%ymm1 \n" - "vpermq $0xd8,%%ymm0,%%ymm0 \n" - "vextractf128 $0x0,%%ymm1,(%1) \n" - "vextractf128 $0x0,%%ymm0,0x00(%1,%2,1) \n" - "lea 0x10(%1),%1 \n" - "sub $0x20,%3 \n" - "jg 1b \n" - "vzeroupper \n" - : "+r"(src_yuy2), // %0 - "+r"(dst_u), // %1 - "+r"(dst_v), // %2 - "+r"(width) // %3 - : "r"((intptr_t)(stride_yuy2)) // %4 - : "memory", "cc", "xmm0", "xmm1", "xmm5"); -} - -void YUY2ToUV422Row_AVX2(const uint8_t* src_yuy2, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - asm volatile( - "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" - "vpsrlw $0x8,%%ymm5,%%ymm5 \n" - "sub %1,%2 \n" - - LABELALIGN - "1: \n" - 
"vmovdqu (%0),%%ymm0 \n" - "vmovdqu 0x20(%0),%%ymm1 \n" - "lea 0x40(%0),%0 \n" - "vpsrlw $0x8,%%ymm0,%%ymm0 \n" - "vpsrlw $0x8,%%ymm1,%%ymm1 \n" - "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" - "vpermq $0xd8,%%ymm0,%%ymm0 \n" - "vpand %%ymm5,%%ymm0,%%ymm1 \n" - "vpsrlw $0x8,%%ymm0,%%ymm0 \n" - "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n" - "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n" - "vpermq $0xd8,%%ymm1,%%ymm1 \n" - "vpermq $0xd8,%%ymm0,%%ymm0 \n" - "vextractf128 $0x0,%%ymm1,(%1) \n" - "vextractf128 $0x0,%%ymm0,0x00(%1,%2,1) \n" - "lea 0x10(%1),%1 \n" - "sub $0x20,%3 \n" - "jg 1b \n" - "vzeroupper \n" - : "+r"(src_yuy2), // %0 - "+r"(dst_u), // %1 - "+r"(dst_v), // %2 - "+r"(width) // %3 - : - : "memory", "cc", "xmm0", "xmm1", "xmm5"); -} - -void UYVYToYRow_AVX2(const uint8_t* src_uyvy, uint8_t* dst_y, int width) { - asm volatile( - - LABELALIGN - "1: \n" - "vmovdqu (%0),%%ymm0 \n" - "vmovdqu 0x20(%0),%%ymm1 \n" - "lea 0x40(%0),%0 \n" - "vpsrlw $0x8,%%ymm0,%%ymm0 \n" - "vpsrlw $0x8,%%ymm1,%%ymm1 \n" - "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" - "vpermq $0xd8,%%ymm0,%%ymm0 \n" - "vmovdqu %%ymm0,(%1) \n" - "lea 0x20(%1),%1 \n" - "sub $0x20,%2 \n" - "jg 1b \n" - "vzeroupper \n" - : "+r"(src_uyvy), // %0 - "+r"(dst_y), // %1 - "+r"(width) // %2 - : - : "memory", "cc", "xmm0", "xmm1", "xmm5"); -} -void UYVYToUVRow_AVX2(const uint8_t* src_uyvy, - int stride_uyvy, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - asm volatile( - "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" - "vpsrlw $0x8,%%ymm5,%%ymm5 \n" - "sub %1,%2 \n" - - LABELALIGN - "1: \n" - "vmovdqu (%0),%%ymm0 \n" - "vmovdqu 0x20(%0),%%ymm1 \n" - "vpavgb 0x00(%0,%4,1),%%ymm0,%%ymm0 \n" - "vpavgb 0x20(%0,%4,1),%%ymm1,%%ymm1 \n" - "lea 0x40(%0),%0 \n" - "vpand %%ymm5,%%ymm0,%%ymm0 \n" - "vpand %%ymm5,%%ymm1,%%ymm1 \n" - "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" - "vpermq $0xd8,%%ymm0,%%ymm0 \n" - "vpand %%ymm5,%%ymm0,%%ymm1 \n" - "vpsrlw $0x8,%%ymm0,%%ymm0 \n" - "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n" - "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n" - "vpermq $0xd8,%%ymm1,%%ymm1 \n" - "vpermq $0xd8,%%ymm0,%%ymm0 \n" - "vextractf128 $0x0,%%ymm1,(%1) \n" - "vextractf128 $0x0,%%ymm0,0x00(%1,%2,1) \n" - "lea 0x10(%1),%1 \n" - "sub $0x20,%3 \n" - "jg 1b \n" - "vzeroupper \n" - : "+r"(src_uyvy), // %0 - "+r"(dst_u), // %1 - "+r"(dst_v), // %2 - "+r"(width) // %3 - : "r"((intptr_t)(stride_uyvy)) // %4 - : "memory", "cc", "xmm0", "xmm1", "xmm5"); -} - -void UYVYToUV422Row_AVX2(const uint8_t* src_uyvy, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - asm volatile( - "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" - "vpsrlw $0x8,%%ymm5,%%ymm5 \n" - "sub %1,%2 \n" - - LABELALIGN - "1: \n" - "vmovdqu (%0),%%ymm0 \n" - "vmovdqu 0x20(%0),%%ymm1 \n" - "lea 0x40(%0),%0 \n" - "vpand %%ymm5,%%ymm0,%%ymm0 \n" - "vpand %%ymm5,%%ymm1,%%ymm1 \n" - "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" - "vpermq $0xd8,%%ymm0,%%ymm0 \n" - "vpand %%ymm5,%%ymm0,%%ymm1 \n" - "vpsrlw $0x8,%%ymm0,%%ymm0 \n" - "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n" - "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n" - "vpermq $0xd8,%%ymm1,%%ymm1 \n" - "vpermq $0xd8,%%ymm0,%%ymm0 \n" - "vextractf128 $0x0,%%ymm1,(%1) \n" - "vextractf128 $0x0,%%ymm0,0x00(%1,%2,1) \n" - "lea 0x10(%1),%1 \n" - "sub $0x20,%3 \n" - "jg 1b \n" - "vzeroupper \n" - : "+r"(src_uyvy), // %0 - "+r"(dst_u), // %1 - "+r"(dst_v), // %2 - "+r"(width) // %3 - : - : "memory", "cc", "xmm0", "xmm1", "xmm5"); -} -#endif // HAS_YUY2TOYROW_AVX2 - -#ifdef HAS_ARGBBLENDROW_SSSE3 -// Shuffle table for isolating alpha. 
-static const uvec8 kShuffleAlpha = {3u, 0x80, 3u, 0x80, 7u, 0x80, 7u, 0x80, - 11u, 0x80, 11u, 0x80, 15u, 0x80, 15u, 0x80}; - -// Blend 8 pixels at a time -void ARGBBlendRow_SSSE3(const uint8_t* src_argb, - const uint8_t* src_argb1, - uint8_t* dst_argb, - int width) { - asm volatile( - "pcmpeqb %%xmm7,%%xmm7 \n" - "psrlw $0xf,%%xmm7 \n" - "pcmpeqb %%xmm6,%%xmm6 \n" - "psrlw $0x8,%%xmm6 \n" - "pcmpeqb %%xmm5,%%xmm5 \n" - "psllw $0x8,%%xmm5 \n" - "pcmpeqb %%xmm4,%%xmm4 \n" - "pslld $0x18,%%xmm4 \n" - "sub $0x4,%3 \n" - "jl 49f \n" - - // 4 pixel loop. - LABELALIGN - "40: \n" - "movdqu (%0),%%xmm3 \n" - "lea 0x10(%0),%0 \n" - "movdqa %%xmm3,%%xmm0 \n" - "pxor %%xmm4,%%xmm3 \n" - "movdqu (%1),%%xmm2 \n" - "pshufb %4,%%xmm3 \n" - "pand %%xmm6,%%xmm2 \n" - "paddw %%xmm7,%%xmm3 \n" - "pmullw %%xmm3,%%xmm2 \n" - "movdqu (%1),%%xmm1 \n" - "lea 0x10(%1),%1 \n" - "psrlw $0x8,%%xmm1 \n" - "por %%xmm4,%%xmm0 \n" - "pmullw %%xmm3,%%xmm1 \n" - "psrlw $0x8,%%xmm2 \n" - "paddusb %%xmm2,%%xmm0 \n" - "pand %%xmm5,%%xmm1 \n" - "paddusb %%xmm1,%%xmm0 \n" - "movdqu %%xmm0,(%2) \n" - "lea 0x10(%2),%2 \n" - "sub $0x4,%3 \n" - "jge 40b \n" - - "49: \n" - "add $0x3,%3 \n" - "jl 99f \n" - - // 1 pixel loop. - "91: \n" - "movd (%0),%%xmm3 \n" - "lea 0x4(%0),%0 \n" - "movdqa %%xmm3,%%xmm0 \n" - "pxor %%xmm4,%%xmm3 \n" - "movd (%1),%%xmm2 \n" - "pshufb %4,%%xmm3 \n" - "pand %%xmm6,%%xmm2 \n" - "paddw %%xmm7,%%xmm3 \n" - "pmullw %%xmm3,%%xmm2 \n" - "movd (%1),%%xmm1 \n" - "lea 0x4(%1),%1 \n" - "psrlw $0x8,%%xmm1 \n" - "por %%xmm4,%%xmm0 \n" - "pmullw %%xmm3,%%xmm1 \n" - "psrlw $0x8,%%xmm2 \n" - "paddusb %%xmm2,%%xmm0 \n" - "pand %%xmm5,%%xmm1 \n" - "paddusb %%xmm1,%%xmm0 \n" - "movd %%xmm0,(%2) \n" - "lea 0x4(%2),%2 \n" - "sub $0x1,%3 \n" - "jge 91b \n" - "99: \n" - : "+r"(src_argb), // %0 - "+r"(src_argb1), // %1 - "+r"(dst_argb), // %2 - "+r"(width) // %3 - : "m"(kShuffleAlpha) // %4 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", - "xmm7"); -} -#endif // HAS_ARGBBLENDROW_SSSE3 - -#ifdef HAS_BLENDPLANEROW_SSSE3 -// Blend 8 pixels at a time. -// unsigned version of math -// =((A2*C2)+(B2*(255-C2))+255)/256 -// signed version of math -// =(((A2-128)*C2)+((B2-128)*(255-C2))+32768+127)/256 -void BlendPlaneRow_SSSE3(const uint8_t* src0, - const uint8_t* src1, - const uint8_t* alpha, - uint8_t* dst, - int width) { - asm volatile( - "pcmpeqb %%xmm5,%%xmm5 \n" - "psllw $0x8,%%xmm5 \n" - "mov $0x80808080,%%eax \n" - "movd %%eax,%%xmm6 \n" - "pshufd $0x0,%%xmm6,%%xmm6 \n" - "mov $0x807f807f,%%eax \n" - "movd %%eax,%%xmm7 \n" - "pshufd $0x0,%%xmm7,%%xmm7 \n" - "sub %2,%0 \n" - "sub %2,%1 \n" - "sub %2,%3 \n" - - // 8 pixel loop. - LABELALIGN - "1: \n" - "movq (%2),%%xmm0 \n" - "punpcklbw %%xmm0,%%xmm0 \n" - "pxor %%xmm5,%%xmm0 \n" - "movq (%0,%2,1),%%xmm1 \n" - "movq (%1,%2,1),%%xmm2 \n" - "punpcklbw %%xmm2,%%xmm1 \n" - "psubb %%xmm6,%%xmm1 \n" - "pmaddubsw %%xmm1,%%xmm0 \n" - "paddw %%xmm7,%%xmm0 \n" - "psrlw $0x8,%%xmm0 \n" - "packuswb %%xmm0,%%xmm0 \n" - "movq %%xmm0,(%3,%2,1) \n" - "lea 0x8(%2),%2 \n" - "sub $0x8,%4 \n" - "jg 1b \n" - : "+r"(src0), // %0 - "+r"(src1), // %1 - "+r"(alpha), // %2 - "+r"(dst), // %3 - "+rm"(width) // %4 - ::"memory", - "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm5", "xmm6", "xmm7"); -} -#endif // HAS_BLENDPLANEROW_SSSE3 - -#ifdef HAS_BLENDPLANEROW_AVX2 -// Blend 32 pixels at a time. 
-// unsigned version of math -// =((A2*C2)+(B2*(255-C2))+255)/256 -// signed version of math -// =(((A2-128)*C2)+((B2-128)*(255-C2))+32768+127)/256 -void BlendPlaneRow_AVX2(const uint8_t* src0, - const uint8_t* src1, - const uint8_t* alpha, - uint8_t* dst, - int width) { - asm volatile( - "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" - "vpsllw $0x8,%%ymm5,%%ymm5 \n" - "mov $0x80808080,%%eax \n" - "vmovd %%eax,%%xmm6 \n" - "vbroadcastss %%xmm6,%%ymm6 \n" - "mov $0x807f807f,%%eax \n" - "vmovd %%eax,%%xmm7 \n" - "vbroadcastss %%xmm7,%%ymm7 \n" - "sub %2,%0 \n" - "sub %2,%1 \n" - "sub %2,%3 \n" - - // 32 pixel loop. - LABELALIGN - "1: \n" - "vmovdqu (%2),%%ymm0 \n" - "vpunpckhbw %%ymm0,%%ymm0,%%ymm3 \n" - "vpunpcklbw %%ymm0,%%ymm0,%%ymm0 \n" - "vpxor %%ymm5,%%ymm3,%%ymm3 \n" - "vpxor %%ymm5,%%ymm0,%%ymm0 \n" - "vmovdqu (%0,%2,1),%%ymm1 \n" - "vmovdqu (%1,%2,1),%%ymm2 \n" - "vpunpckhbw %%ymm2,%%ymm1,%%ymm4 \n" - "vpunpcklbw %%ymm2,%%ymm1,%%ymm1 \n" - "vpsubb %%ymm6,%%ymm4,%%ymm4 \n" - "vpsubb %%ymm6,%%ymm1,%%ymm1 \n" - "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n" - "vpmaddubsw %%ymm1,%%ymm0,%%ymm0 \n" - "vpaddw %%ymm7,%%ymm3,%%ymm3 \n" - "vpaddw %%ymm7,%%ymm0,%%ymm0 \n" - "vpsrlw $0x8,%%ymm3,%%ymm3 \n" - "vpsrlw $0x8,%%ymm0,%%ymm0 \n" - "vpackuswb %%ymm3,%%ymm0,%%ymm0 \n" - "vmovdqu %%ymm0,(%3,%2,1) \n" - "lea 0x20(%2),%2 \n" - "sub $0x20,%4 \n" - "jg 1b \n" - "vzeroupper \n" - : "+r"(src0), // %0 - "+r"(src1), // %1 - "+r"(alpha), // %2 - "+r"(dst), // %3 - "+rm"(width) // %4 - ::"memory", - "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", - "xmm7"); -} -#endif // HAS_BLENDPLANEROW_AVX2 - -#ifdef HAS_ARGBATTENUATEROW_SSSE3 -// Shuffle table duplicating alpha. -static const uvec8 kShuffleAlpha0 = {3u, 3u, 3u, 3u, 3u, 3u, 128u, 128u, - 7u, 7u, 7u, 7u, 7u, 7u, 128u, 128u}; -static const uvec8 kShuffleAlpha1 = {11u, 11u, 11u, 11u, 11u, 11u, 128u, 128u, - 15u, 15u, 15u, 15u, 15u, 15u, 128u, 128u}; -// Attenuate 4 pixels at a time. -void ARGBAttenuateRow_SSSE3(const uint8_t* src_argb, - uint8_t* dst_argb, - int width) { - asm volatile( - "pcmpeqb %%xmm3,%%xmm3 \n" - "pslld $0x18,%%xmm3 \n" - "movdqa %3,%%xmm4 \n" - "movdqa %4,%%xmm5 \n" - - // 4 pixel loop. - LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "pshufb %%xmm4,%%xmm0 \n" - "movdqu (%0),%%xmm1 \n" - "punpcklbw %%xmm1,%%xmm1 \n" - "pmulhuw %%xmm1,%%xmm0 \n" - "movdqu (%0),%%xmm1 \n" - "pshufb %%xmm5,%%xmm1 \n" - "movdqu (%0),%%xmm2 \n" - "punpckhbw %%xmm2,%%xmm2 \n" - "pmulhuw %%xmm2,%%xmm1 \n" - "movdqu (%0),%%xmm2 \n" - "lea 0x10(%0),%0 \n" - "pand %%xmm3,%%xmm2 \n" - "psrlw $0x8,%%xmm0 \n" - "psrlw $0x8,%%xmm1 \n" - "packuswb %%xmm1,%%xmm0 \n" - "por %%xmm2,%%xmm0 \n" - "movdqu %%xmm0,(%1) \n" - "lea 0x10(%1),%1 \n" - "sub $0x4,%2 \n" - "jg 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_argb), // %1 - "+r"(width) // %2 - : "m"(kShuffleAlpha0), // %3 - "m"(kShuffleAlpha1) // %4 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); -} -#endif // HAS_ARGBATTENUATEROW_SSSE3 - -#ifdef HAS_ARGBATTENUATEROW_AVX2 -// Shuffle table duplicating alpha. -static const uvec8 kShuffleAlpha_AVX2 = {6u, 7u, 6u, 7u, 6u, 7u, - 128u, 128u, 14u, 15u, 14u, 15u, - 14u, 15u, 128u, 128u}; -// Attenuate 8 pixels at a time. -void ARGBAttenuateRow_AVX2(const uint8_t* src_argb, - uint8_t* dst_argb, - int width) { - asm volatile( - "vbroadcastf128 %3,%%ymm4 \n" - "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" - "vpslld $0x18,%%ymm5,%%ymm5 \n" - "sub %0,%1 \n" - - // 8 pixel loop. 
- LABELALIGN - "1: \n" - "vmovdqu (%0),%%ymm6 \n" - "vpunpcklbw %%ymm6,%%ymm6,%%ymm0 \n" - "vpunpckhbw %%ymm6,%%ymm6,%%ymm1 \n" - "vpshufb %%ymm4,%%ymm0,%%ymm2 \n" - "vpshufb %%ymm4,%%ymm1,%%ymm3 \n" - "vpmulhuw %%ymm2,%%ymm0,%%ymm0 \n" - "vpmulhuw %%ymm3,%%ymm1,%%ymm1 \n" - "vpand %%ymm5,%%ymm6,%%ymm6 \n" - "vpsrlw $0x8,%%ymm0,%%ymm0 \n" - "vpsrlw $0x8,%%ymm1,%%ymm1 \n" - "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" - "vpor %%ymm6,%%ymm0,%%ymm0 \n" - "vmovdqu %%ymm0,0x00(%0,%1,1) \n" - "lea 0x20(%0),%0 \n" - "sub $0x8,%2 \n" - "jg 1b \n" - "vzeroupper \n" - : "+r"(src_argb), // %0 - "+r"(dst_argb), // %1 - "+r"(width) // %2 - : "m"(kShuffleAlpha_AVX2) // %3 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"); -} -#endif // HAS_ARGBATTENUATEROW_AVX2 - -#ifdef HAS_ARGBUNATTENUATEROW_SSE2 -// Unattenuate 4 pixels at a time. -void ARGBUnattenuateRow_SSE2(const uint8_t* src_argb, - uint8_t* dst_argb, - int width) { - uintptr_t alpha; - asm volatile( - // 4 pixel loop. - LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "movzb 0x03(%0),%3 \n" - "punpcklbw %%xmm0,%%xmm0 \n" - "movd 0x00(%4,%3,4),%%xmm2 \n" - "movzb 0x07(%0),%3 \n" - "movd 0x00(%4,%3,4),%%xmm3 \n" - "pshuflw $0x40,%%xmm2,%%xmm2 \n" - "pshuflw $0x40,%%xmm3,%%xmm3 \n" - "movlhps %%xmm3,%%xmm2 \n" - "pmulhuw %%xmm2,%%xmm0 \n" - "movdqu (%0),%%xmm1 \n" - "movzb 0x0b(%0),%3 \n" - "punpckhbw %%xmm1,%%xmm1 \n" - "movd 0x00(%4,%3,4),%%xmm2 \n" - "movzb 0x0f(%0),%3 \n" - "movd 0x00(%4,%3,4),%%xmm3 \n" - "pshuflw $0x40,%%xmm2,%%xmm2 \n" - "pshuflw $0x40,%%xmm3,%%xmm3 \n" - "movlhps %%xmm3,%%xmm2 \n" - "pmulhuw %%xmm2,%%xmm1 \n" - "lea 0x10(%0),%0 \n" - "packuswb %%xmm1,%%xmm0 \n" - "movdqu %%xmm0,(%1) \n" - "lea 0x10(%1),%1 \n" - "sub $0x4,%2 \n" - "jg 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_argb), // %1 - "+r"(width), // %2 - "=&r"(alpha) // %3 - : "r"(fixed_invtbl8) // %4 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); -} -#endif // HAS_ARGBUNATTENUATEROW_SSE2 - -#ifdef HAS_ARGBUNATTENUATEROW_AVX2 -// Shuffle table duplicating alpha. -static const uvec8 kUnattenShuffleAlpha_AVX2 = { - 0u, 1u, 0u, 1u, 0u, 1u, 6u, 7u, 8u, 9u, 8u, 9u, 8u, 9u, 14u, 15u}; -// Unattenuate 8 pixels at a time. -void ARGBUnattenuateRow_AVX2(const uint8_t* src_argb, - uint8_t* dst_argb, - int width) { - uintptr_t alpha; - asm volatile( - "sub %0,%1 \n" - "vbroadcastf128 %5,%%ymm5 \n" - - // 8 pixel loop. 
- LABELALIGN - "1: \n" - // replace VPGATHER - "movzb 0x03(%0),%3 \n" - "vmovd 0x00(%4,%3,4),%%xmm0 \n" - "movzb 0x07(%0),%3 \n" - "vmovd 0x00(%4,%3,4),%%xmm1 \n" - "movzb 0x0b(%0),%3 \n" - "vpunpckldq %%xmm1,%%xmm0,%%xmm6 \n" - "vmovd 0x00(%4,%3,4),%%xmm2 \n" - "movzb 0x0f(%0),%3 \n" - "vmovd 0x00(%4,%3,4),%%xmm3 \n" - "movzb 0x13(%0),%3 \n" - "vpunpckldq %%xmm3,%%xmm2,%%xmm7 \n" - "vmovd 0x00(%4,%3,4),%%xmm0 \n" - "movzb 0x17(%0),%3 \n" - "vmovd 0x00(%4,%3,4),%%xmm1 \n" - "movzb 0x1b(%0),%3 \n" - "vpunpckldq %%xmm1,%%xmm0,%%xmm0 \n" - "vmovd 0x00(%4,%3,4),%%xmm2 \n" - "movzb 0x1f(%0),%3 \n" - "vmovd 0x00(%4,%3,4),%%xmm3 \n" - "vpunpckldq %%xmm3,%%xmm2,%%xmm2 \n" - "vpunpcklqdq %%xmm7,%%xmm6,%%xmm3 \n" - "vpunpcklqdq %%xmm2,%%xmm0,%%xmm0 \n" - "vinserti128 $0x1,%%xmm0,%%ymm3,%%ymm3 \n" - // end of VPGATHER - - "vmovdqu (%0),%%ymm6 \n" - "vpunpcklbw %%ymm6,%%ymm6,%%ymm0 \n" - "vpunpckhbw %%ymm6,%%ymm6,%%ymm1 \n" - "vpunpcklwd %%ymm3,%%ymm3,%%ymm2 \n" - "vpunpckhwd %%ymm3,%%ymm3,%%ymm3 \n" - "vpshufb %%ymm5,%%ymm2,%%ymm2 \n" - "vpshufb %%ymm5,%%ymm3,%%ymm3 \n" - "vpmulhuw %%ymm2,%%ymm0,%%ymm0 \n" - "vpmulhuw %%ymm3,%%ymm1,%%ymm1 \n" - "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" - "vmovdqu %%ymm0,0x00(%0,%1,1) \n" - "lea 0x20(%0),%0 \n" - "sub $0x8,%2 \n" - "jg 1b \n" - "vzeroupper \n" - : "+r"(src_argb), // %0 - "+r"(dst_argb), // %1 - "+r"(width), // %2 - "=&r"(alpha) // %3 - : "r"(fixed_invtbl8), // %4 - "m"(kUnattenShuffleAlpha_AVX2) // %5 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", - "xmm7"); -} -#endif // HAS_ARGBUNATTENUATEROW_AVX2 - -#ifdef HAS_ARGBGRAYROW_SSSE3 -// Convert 8 ARGB pixels (64 bytes) to 8 Gray ARGB pixels -void ARGBGrayRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_argb, int width) { - asm volatile( - "movdqa %3,%%xmm4 \n" - "movdqa %4,%%xmm5 \n" - - // 8 pixel loop. - LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "psubb %%xmm5,%%xmm0 \n" - "psubb %%xmm5,%%xmm1 \n" - "movdqu %%xmm4,%%xmm6 \n" - "pmaddubsw %%xmm0,%%xmm6 \n" - "movdqu %%xmm4,%%xmm0 \n" - "pmaddubsw %%xmm1,%%xmm0 \n" - "phaddw %%xmm0,%%xmm6 \n" - "paddw %%xmm5,%%xmm6 \n" - "psrlw $0x8,%%xmm6 \n" - "packuswb %%xmm6,%%xmm6 \n" - "movdqu (%0),%%xmm2 \n" - "movdqu 0x10(%0),%%xmm3 \n" - "lea 0x20(%0),%0 \n" - "psrld $0x18,%%xmm2 \n" - "psrld $0x18,%%xmm3 \n" - "packuswb %%xmm3,%%xmm2 \n" - "packuswb %%xmm2,%%xmm2 \n" - "movdqa %%xmm6,%%xmm3 \n" - "punpcklbw %%xmm6,%%xmm6 \n" - "punpcklbw %%xmm2,%%xmm3 \n" - "movdqa %%xmm6,%%xmm1 \n" - "punpcklwd %%xmm3,%%xmm6 \n" - "punpckhwd %%xmm3,%%xmm1 \n" - "movdqu %%xmm6,(%1) \n" - "movdqu %%xmm1,0x10(%1) \n" - "lea 0x20(%1),%1 \n" - "sub $0x8,%2 \n" - "jg 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_argb), // %1 - "+r"(width) // %2 - : "m"(kARGBToYJ), // %3 - "m"(kSub128) // %4 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"); -} -#endif // HAS_ARGBGRAYROW_SSSE3 - -#ifdef HAS_ARGBSEPIAROW_SSSE3 -// b = (r * 35 + g * 68 + b * 17) >> 7 -// g = (r * 45 + g * 88 + b * 22) >> 7 -// r = (r * 50 + g * 98 + b * 24) >> 7 -// Constant for ARGB color to sepia tone -static const vec8 kARGBToSepiaB = {17, 68, 35, 0, 17, 68, 35, 0, - 17, 68, 35, 0, 17, 68, 35, 0}; - -static const vec8 kARGBToSepiaG = {22, 88, 45, 0, 22, 88, 45, 0, - 22, 88, 45, 0, 22, 88, 45, 0}; - -static const vec8 kARGBToSepiaR = {24, 98, 50, 0, 24, 98, 50, 0, - 24, 98, 50, 0, 24, 98, 50, 0}; - -// Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels. 
-void ARGBSepiaRow_SSSE3(uint8_t* dst_argb, int width) { - asm volatile( - "movdqa %2,%%xmm2 \n" - "movdqa %3,%%xmm3 \n" - "movdqa %4,%%xmm4 \n" - - // 8 pixel loop. - LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm6 \n" - "pmaddubsw %%xmm2,%%xmm0 \n" - "pmaddubsw %%xmm2,%%xmm6 \n" - "phaddw %%xmm6,%%xmm0 \n" - "psrlw $0x7,%%xmm0 \n" - "packuswb %%xmm0,%%xmm0 \n" - "movdqu (%0),%%xmm5 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "pmaddubsw %%xmm3,%%xmm5 \n" - "pmaddubsw %%xmm3,%%xmm1 \n" - "phaddw %%xmm1,%%xmm5 \n" - "psrlw $0x7,%%xmm5 \n" - "packuswb %%xmm5,%%xmm5 \n" - "punpcklbw %%xmm5,%%xmm0 \n" - "movdqu (%0),%%xmm5 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "pmaddubsw %%xmm4,%%xmm5 \n" - "pmaddubsw %%xmm4,%%xmm1 \n" - "phaddw %%xmm1,%%xmm5 \n" - "psrlw $0x7,%%xmm5 \n" - "packuswb %%xmm5,%%xmm5 \n" - "movdqu (%0),%%xmm6 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "psrld $0x18,%%xmm6 \n" - "psrld $0x18,%%xmm1 \n" - "packuswb %%xmm1,%%xmm6 \n" - "packuswb %%xmm6,%%xmm6 \n" - "punpcklbw %%xmm6,%%xmm5 \n" - "movdqa %%xmm0,%%xmm1 \n" - "punpcklwd %%xmm5,%%xmm0 \n" - "punpckhwd %%xmm5,%%xmm1 \n" - "movdqu %%xmm0,(%0) \n" - "movdqu %%xmm1,0x10(%0) \n" - "lea 0x20(%0),%0 \n" - "sub $0x8,%1 \n" - "jg 1b \n" - : "+r"(dst_argb), // %0 - "+r"(width) // %1 - : "m"(kARGBToSepiaB), // %2 - "m"(kARGBToSepiaG), // %3 - "m"(kARGBToSepiaR) // %4 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"); -} -#endif // HAS_ARGBSEPIAROW_SSSE3 - -#ifdef HAS_ARGBCOLORMATRIXROW_SSSE3 -// Tranform 8 ARGB pixels (32 bytes) with color matrix. -// Same as Sepia except matrix is provided. -void ARGBColorMatrixRow_SSSE3(const uint8_t* src_argb, - uint8_t* dst_argb, - const int8_t* matrix_argb, - int width) { - asm volatile( - "movdqu (%3),%%xmm5 \n" - "pshufd $0x00,%%xmm5,%%xmm2 \n" - "pshufd $0x55,%%xmm5,%%xmm3 \n" - "pshufd $0xaa,%%xmm5,%%xmm4 \n" - "pshufd $0xff,%%xmm5,%%xmm5 \n" - - // 8 pixel loop. - LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm7 \n" - "pmaddubsw %%xmm2,%%xmm0 \n" - "pmaddubsw %%xmm2,%%xmm7 \n" - "movdqu (%0),%%xmm6 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "pmaddubsw %%xmm3,%%xmm6 \n" - "pmaddubsw %%xmm3,%%xmm1 \n" - "phaddsw %%xmm7,%%xmm0 \n" - "phaddsw %%xmm1,%%xmm6 \n" - "psraw $0x6,%%xmm0 \n" - "psraw $0x6,%%xmm6 \n" - "packuswb %%xmm0,%%xmm0 \n" - "packuswb %%xmm6,%%xmm6 \n" - "punpcklbw %%xmm6,%%xmm0 \n" - "movdqu (%0),%%xmm1 \n" - "movdqu 0x10(%0),%%xmm7 \n" - "pmaddubsw %%xmm4,%%xmm1 \n" - "pmaddubsw %%xmm4,%%xmm7 \n" - "phaddsw %%xmm7,%%xmm1 \n" - "movdqu (%0),%%xmm6 \n" - "movdqu 0x10(%0),%%xmm7 \n" - "pmaddubsw %%xmm5,%%xmm6 \n" - "pmaddubsw %%xmm5,%%xmm7 \n" - "phaddsw %%xmm7,%%xmm6 \n" - "psraw $0x6,%%xmm1 \n" - "psraw $0x6,%%xmm6 \n" - "packuswb %%xmm1,%%xmm1 \n" - "packuswb %%xmm6,%%xmm6 \n" - "punpcklbw %%xmm6,%%xmm1 \n" - "movdqa %%xmm0,%%xmm6 \n" - "punpcklwd %%xmm1,%%xmm0 \n" - "punpckhwd %%xmm1,%%xmm6 \n" - "movdqu %%xmm0,(%1) \n" - "movdqu %%xmm6,0x10(%1) \n" - "lea 0x20(%0),%0 \n" - "lea 0x20(%1),%1 \n" - "sub $0x8,%2 \n" - "jg 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_argb), // %1 - "+r"(width) // %2 - : "r"(matrix_argb) // %3 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", - "xmm7"); -} -#endif // HAS_ARGBCOLORMATRIXROW_SSSE3 - -#ifdef HAS_ARGBQUANTIZEROW_SSE2 -// Quantize 4 ARGB pixels (16 bytes). 
-void ARGBQuantizeRow_SSE2(uint8_t* dst_argb, - int scale, - int interval_size, - int interval_offset, - int width) { - asm volatile( - "movd %2,%%xmm2 \n" - "movd %3,%%xmm3 \n" - "movd %4,%%xmm4 \n" - "pshuflw $0x40,%%xmm2,%%xmm2 \n" - "pshufd $0x44,%%xmm2,%%xmm2 \n" - "pshuflw $0x40,%%xmm3,%%xmm3 \n" - "pshufd $0x44,%%xmm3,%%xmm3 \n" - "pshuflw $0x40,%%xmm4,%%xmm4 \n" - "pshufd $0x44,%%xmm4,%%xmm4 \n" - "pxor %%xmm5,%%xmm5 \n" - "pcmpeqb %%xmm6,%%xmm6 \n" - "pslld $0x18,%%xmm6 \n" - - // 4 pixel loop. - LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "punpcklbw %%xmm5,%%xmm0 \n" - "pmulhuw %%xmm2,%%xmm0 \n" - "movdqu (%0),%%xmm1 \n" - "punpckhbw %%xmm5,%%xmm1 \n" - "pmulhuw %%xmm2,%%xmm1 \n" - "pmullw %%xmm3,%%xmm0 \n" - "movdqu (%0),%%xmm7 \n" - "pmullw %%xmm3,%%xmm1 \n" - "pand %%xmm6,%%xmm7 \n" - "paddw %%xmm4,%%xmm0 \n" - "paddw %%xmm4,%%xmm1 \n" - "packuswb %%xmm1,%%xmm0 \n" - "por %%xmm7,%%xmm0 \n" - "movdqu %%xmm0,(%0) \n" - "lea 0x10(%0),%0 \n" - "sub $0x4,%1 \n" - "jg 1b \n" - : "+r"(dst_argb), // %0 - "+r"(width) // %1 - : "r"(scale), // %2 - "r"(interval_size), // %3 - "r"(interval_offset) // %4 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", - "xmm7"); -} -#endif // HAS_ARGBQUANTIZEROW_SSE2 - -#ifdef HAS_ARGBSHADEROW_SSE2 -// Shade 4 pixels at a time by specified value. -void ARGBShadeRow_SSE2(const uint8_t* src_argb, - uint8_t* dst_argb, - int width, - uint32_t value) { - asm volatile( - "movd %3,%%xmm2 \n" - "punpcklbw %%xmm2,%%xmm2 \n" - "punpcklqdq %%xmm2,%%xmm2 \n" - - // 4 pixel loop. - LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "lea 0x10(%0),%0 \n" - "movdqa %%xmm0,%%xmm1 \n" - "punpcklbw %%xmm0,%%xmm0 \n" - "punpckhbw %%xmm1,%%xmm1 \n" - "pmulhuw %%xmm2,%%xmm0 \n" - "pmulhuw %%xmm2,%%xmm1 \n" - "psrlw $0x8,%%xmm0 \n" - "psrlw $0x8,%%xmm1 \n" - "packuswb %%xmm1,%%xmm0 \n" - "movdqu %%xmm0,(%1) \n" - "lea 0x10(%1),%1 \n" - "sub $0x4,%2 \n" - "jg 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_argb), // %1 - "+r"(width) // %2 - : "r"(value) // %3 - : "memory", "cc", "xmm0", "xmm1", "xmm2"); -} -#endif // HAS_ARGBSHADEROW_SSE2 - -#ifdef HAS_ARGBMULTIPLYROW_SSE2 -// Multiply 2 rows of ARGB pixels together, 4 pixels at a time. -void ARGBMultiplyRow_SSE2(const uint8_t* src_argb, - const uint8_t* src_argb1, - uint8_t* dst_argb, - int width) { - asm volatile( - - "pxor %%xmm5,%%xmm5 \n" - - // 4 pixel loop. - LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "lea 0x10(%0),%0 \n" - "movdqu (%1),%%xmm2 \n" - "lea 0x10(%1),%1 \n" - "movdqu %%xmm0,%%xmm1 \n" - "movdqu %%xmm2,%%xmm3 \n" - "punpcklbw %%xmm0,%%xmm0 \n" - "punpckhbw %%xmm1,%%xmm1 \n" - "punpcklbw %%xmm5,%%xmm2 \n" - "punpckhbw %%xmm5,%%xmm3 \n" - "pmulhuw %%xmm2,%%xmm0 \n" - "pmulhuw %%xmm3,%%xmm1 \n" - "packuswb %%xmm1,%%xmm0 \n" - "movdqu %%xmm0,(%2) \n" - "lea 0x10(%2),%2 \n" - "sub $0x4,%3 \n" - "jg 1b \n" - : "+r"(src_argb), // %0 - "+r"(src_argb1), // %1 - "+r"(dst_argb), // %2 - "+r"(width) // %3 - : - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"); -} -#endif // HAS_ARGBMULTIPLYROW_SSE2 - -#ifdef HAS_ARGBMULTIPLYROW_AVX2 -// Multiply 2 rows of ARGB pixels together, 8 pixels at a time. -void ARGBMultiplyRow_AVX2(const uint8_t* src_argb, - const uint8_t* src_argb1, - uint8_t* dst_argb, - int width) { - asm volatile( - - "vpxor %%ymm5,%%ymm5,%%ymm5 \n" - - // 4 pixel loop. 
- LABELALIGN - "1: \n" - "vmovdqu (%0),%%ymm1 \n" - "lea 0x20(%0),%0 \n" - "vmovdqu (%1),%%ymm3 \n" - "lea 0x20(%1),%1 \n" - "vpunpcklbw %%ymm1,%%ymm1,%%ymm0 \n" - "vpunpckhbw %%ymm1,%%ymm1,%%ymm1 \n" - "vpunpcklbw %%ymm5,%%ymm3,%%ymm2 \n" - "vpunpckhbw %%ymm5,%%ymm3,%%ymm3 \n" - "vpmulhuw %%ymm2,%%ymm0,%%ymm0 \n" - "vpmulhuw %%ymm3,%%ymm1,%%ymm1 \n" - "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" - "vmovdqu %%ymm0,(%2) \n" - "lea 0x20(%2),%2 \n" - "sub $0x8,%3 \n" - "jg 1b \n" - "vzeroupper \n" - : "+r"(src_argb), // %0 - "+r"(src_argb1), // %1 - "+r"(dst_argb), // %2 - "+r"(width) // %3 - : - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"); -} -#endif // HAS_ARGBMULTIPLYROW_AVX2 - -#ifdef HAS_ARGBADDROW_SSE2 -// Add 2 rows of ARGB pixels together, 4 pixels at a time. -void ARGBAddRow_SSE2(const uint8_t* src_argb, - const uint8_t* src_argb1, - uint8_t* dst_argb, - int width) { - asm volatile( - // 4 pixel loop. - LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "lea 0x10(%0),%0 \n" - "movdqu (%1),%%xmm1 \n" - "lea 0x10(%1),%1 \n" - "paddusb %%xmm1,%%xmm0 \n" - "movdqu %%xmm0,(%2) \n" - "lea 0x10(%2),%2 \n" - "sub $0x4,%3 \n" - "jg 1b \n" - : "+r"(src_argb), // %0 - "+r"(src_argb1), // %1 - "+r"(dst_argb), // %2 - "+r"(width) // %3 - : - : "memory", "cc", "xmm0", "xmm1"); -} -#endif // HAS_ARGBADDROW_SSE2 - -#ifdef HAS_ARGBADDROW_AVX2 -// Add 2 rows of ARGB pixels together, 4 pixels at a time. -void ARGBAddRow_AVX2(const uint8_t* src_argb, - const uint8_t* src_argb1, - uint8_t* dst_argb, - int width) { - asm volatile( - // 4 pixel loop. - LABELALIGN - "1: \n" - "vmovdqu (%0),%%ymm0 \n" - "lea 0x20(%0),%0 \n" - "vpaddusb (%1),%%ymm0,%%ymm0 \n" - "lea 0x20(%1),%1 \n" - "vmovdqu %%ymm0,(%2) \n" - "lea 0x20(%2),%2 \n" - "sub $0x8,%3 \n" - "jg 1b \n" - "vzeroupper \n" - : "+r"(src_argb), // %0 - "+r"(src_argb1), // %1 - "+r"(dst_argb), // %2 - "+r"(width) // %3 - : - : "memory", "cc", "xmm0"); -} -#endif // HAS_ARGBADDROW_AVX2 - -#ifdef HAS_ARGBSUBTRACTROW_SSE2 -// Subtract 2 rows of ARGB pixels, 4 pixels at a time. -void ARGBSubtractRow_SSE2(const uint8_t* src_argb, - const uint8_t* src_argb1, - uint8_t* dst_argb, - int width) { - asm volatile( - // 4 pixel loop. - LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "lea 0x10(%0),%0 \n" - "movdqu (%1),%%xmm1 \n" - "lea 0x10(%1),%1 \n" - "psubusb %%xmm1,%%xmm0 \n" - "movdqu %%xmm0,(%2) \n" - "lea 0x10(%2),%2 \n" - "sub $0x4,%3 \n" - "jg 1b \n" - : "+r"(src_argb), // %0 - "+r"(src_argb1), // %1 - "+r"(dst_argb), // %2 - "+r"(width) // %3 - : - : "memory", "cc", "xmm0", "xmm1"); -} -#endif // HAS_ARGBSUBTRACTROW_SSE2 - -#ifdef HAS_ARGBSUBTRACTROW_AVX2 -// Subtract 2 rows of ARGB pixels, 8 pixels at a time. -void ARGBSubtractRow_AVX2(const uint8_t* src_argb, - const uint8_t* src_argb1, - uint8_t* dst_argb, - int width) { - asm volatile( - // 4 pixel loop. 
- LABELALIGN - "1: \n" - "vmovdqu (%0),%%ymm0 \n" - "lea 0x20(%0),%0 \n" - "vpsubusb (%1),%%ymm0,%%ymm0 \n" - "lea 0x20(%1),%1 \n" - "vmovdqu %%ymm0,(%2) \n" - "lea 0x20(%2),%2 \n" - "sub $0x8,%3 \n" - "jg 1b \n" - "vzeroupper \n" - : "+r"(src_argb), // %0 - "+r"(src_argb1), // %1 - "+r"(dst_argb), // %2 - "+r"(width) // %3 - : - : "memory", "cc", "xmm0"); -} -#endif // HAS_ARGBSUBTRACTROW_AVX2 - -#ifdef HAS_SOBELXROW_SSE2 -// SobelX as a matrix is -// -1 0 1 -// -2 0 2 -// -1 0 1 -void SobelXRow_SSE2(const uint8_t* src_y0, - const uint8_t* src_y1, - const uint8_t* src_y2, - uint8_t* dst_sobelx, - int width) { - asm volatile( - "sub %0,%1 \n" - "sub %0,%2 \n" - "sub %0,%3 \n" - "pxor %%xmm5,%%xmm5 \n" - - // 8 pixel loop. - LABELALIGN - "1: \n" - "movq (%0),%%xmm0 \n" - "movq 0x2(%0),%%xmm1 \n" - "punpcklbw %%xmm5,%%xmm0 \n" - "punpcklbw %%xmm5,%%xmm1 \n" - "psubw %%xmm1,%%xmm0 \n" - "movq 0x00(%0,%1,1),%%xmm1 \n" - "movq 0x02(%0,%1,1),%%xmm2 \n" - "punpcklbw %%xmm5,%%xmm1 \n" - "punpcklbw %%xmm5,%%xmm2 \n" - "psubw %%xmm2,%%xmm1 \n" - "movq 0x00(%0,%2,1),%%xmm2 \n" - "movq 0x02(%0,%2,1),%%xmm3 \n" - "punpcklbw %%xmm5,%%xmm2 \n" - "punpcklbw %%xmm5,%%xmm3 \n" - "psubw %%xmm3,%%xmm2 \n" - "paddw %%xmm2,%%xmm0 \n" - "paddw %%xmm1,%%xmm0 \n" - "paddw %%xmm1,%%xmm0 \n" - "pxor %%xmm1,%%xmm1 \n" - "psubw %%xmm0,%%xmm1 \n" - "pmaxsw %%xmm1,%%xmm0 \n" - "packuswb %%xmm0,%%xmm0 \n" - "movq %%xmm0,0x00(%0,%3,1) \n" - "lea 0x8(%0),%0 \n" - "sub $0x8,%4 \n" - "jg 1b \n" - : "+r"(src_y0), // %0 - "+r"(src_y1), // %1 - "+r"(src_y2), // %2 - "+r"(dst_sobelx), // %3 - "+r"(width) // %4 - : - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"); -} -#endif // HAS_SOBELXROW_SSE2 - -#ifdef HAS_SOBELYROW_SSE2 -// SobelY as a matrix is -// -1 -2 -1 -// 0 0 0 -// 1 2 1 -void SobelYRow_SSE2(const uint8_t* src_y0, - const uint8_t* src_y1, - uint8_t* dst_sobely, - int width) { - asm volatile( - "sub %0,%1 \n" - "sub %0,%2 \n" - "pxor %%xmm5,%%xmm5 \n" - - // 8 pixel loop. - LABELALIGN - "1: \n" - "movq (%0),%%xmm0 \n" - "movq 0x00(%0,%1,1),%%xmm1 \n" - "punpcklbw %%xmm5,%%xmm0 \n" - "punpcklbw %%xmm5,%%xmm1 \n" - "psubw %%xmm1,%%xmm0 \n" - "movq 0x1(%0),%%xmm1 \n" - "movq 0x01(%0,%1,1),%%xmm2 \n" - "punpcklbw %%xmm5,%%xmm1 \n" - "punpcklbw %%xmm5,%%xmm2 \n" - "psubw %%xmm2,%%xmm1 \n" - "movq 0x2(%0),%%xmm2 \n" - "movq 0x02(%0,%1,1),%%xmm3 \n" - "punpcklbw %%xmm5,%%xmm2 \n" - "punpcklbw %%xmm5,%%xmm3 \n" - "psubw %%xmm3,%%xmm2 \n" - "paddw %%xmm2,%%xmm0 \n" - "paddw %%xmm1,%%xmm0 \n" - "paddw %%xmm1,%%xmm0 \n" - "pxor %%xmm1,%%xmm1 \n" - "psubw %%xmm0,%%xmm1 \n" - "pmaxsw %%xmm1,%%xmm0 \n" - "packuswb %%xmm0,%%xmm0 \n" - "movq %%xmm0,0x00(%0,%2,1) \n" - "lea 0x8(%0),%0 \n" - "sub $0x8,%3 \n" - "jg 1b \n" - : "+r"(src_y0), // %0 - "+r"(src_y1), // %1 - "+r"(dst_sobely), // %2 - "+r"(width) // %3 - : - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"); -} -#endif // HAS_SOBELYROW_SSE2 - -#ifdef HAS_SOBELROW_SSE2 -// Adds Sobel X and Sobel Y and stores Sobel into ARGB. -// A = 255 -// R = Sobel -// G = Sobel -// B = Sobel -void SobelRow_SSE2(const uint8_t* src_sobelx, - const uint8_t* src_sobely, - uint8_t* dst_argb, - int width) { - asm volatile( - "sub %0,%1 \n" - "pcmpeqb %%xmm5,%%xmm5 \n" - "pslld $0x18,%%xmm5 \n" - - // 8 pixel loop. 
- LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x00(%0,%1,1),%%xmm1 \n" - "lea 0x10(%0),%0 \n" - "paddusb %%xmm1,%%xmm0 \n" - "movdqa %%xmm0,%%xmm2 \n" - "punpcklbw %%xmm0,%%xmm2 \n" - "punpckhbw %%xmm0,%%xmm0 \n" - "movdqa %%xmm2,%%xmm1 \n" - "punpcklwd %%xmm2,%%xmm1 \n" - "punpckhwd %%xmm2,%%xmm2 \n" - "por %%xmm5,%%xmm1 \n" - "por %%xmm5,%%xmm2 \n" - "movdqa %%xmm0,%%xmm3 \n" - "punpcklwd %%xmm0,%%xmm3 \n" - "punpckhwd %%xmm0,%%xmm0 \n" - "por %%xmm5,%%xmm3 \n" - "por %%xmm5,%%xmm0 \n" - "movdqu %%xmm1,(%2) \n" - "movdqu %%xmm2,0x10(%2) \n" - "movdqu %%xmm3,0x20(%2) \n" - "movdqu %%xmm0,0x30(%2) \n" - "lea 0x40(%2),%2 \n" - "sub $0x10,%3 \n" - "jg 1b \n" - : "+r"(src_sobelx), // %0 - "+r"(src_sobely), // %1 - "+r"(dst_argb), // %2 - "+r"(width) // %3 - : - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"); -} -#endif // HAS_SOBELROW_SSE2 - -#ifdef HAS_SOBELTOPLANEROW_SSE2 -// Adds Sobel X and Sobel Y and stores Sobel into a plane. -void SobelToPlaneRow_SSE2(const uint8_t* src_sobelx, - const uint8_t* src_sobely, - uint8_t* dst_y, - int width) { - asm volatile( - "sub %0,%1 \n" - "pcmpeqb %%xmm5,%%xmm5 \n" - "pslld $0x18,%%xmm5 \n" - - // 8 pixel loop. - LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x00(%0,%1,1),%%xmm1 \n" - "lea 0x10(%0),%0 \n" - "paddusb %%xmm1,%%xmm0 \n" - "movdqu %%xmm0,(%2) \n" - "lea 0x10(%2),%2 \n" - "sub $0x10,%3 \n" - "jg 1b \n" - : "+r"(src_sobelx), // %0 - "+r"(src_sobely), // %1 - "+r"(dst_y), // %2 - "+r"(width) // %3 - : - : "memory", "cc", "xmm0", "xmm1"); -} -#endif // HAS_SOBELTOPLANEROW_SSE2 - -#ifdef HAS_SOBELXYROW_SSE2 -// Mixes Sobel X, Sobel Y and Sobel into ARGB. -// A = 255 -// R = Sobel X -// G = Sobel -// B = Sobel Y -void SobelXYRow_SSE2(const uint8_t* src_sobelx, - const uint8_t* src_sobely, - uint8_t* dst_argb, - int width) { - asm volatile( - "sub %0,%1 \n" - "pcmpeqb %%xmm5,%%xmm5 \n" - - // 8 pixel loop. - LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x00(%0,%1,1),%%xmm1 \n" - "lea 0x10(%0),%0 \n" - "movdqa %%xmm0,%%xmm2 \n" - "paddusb %%xmm1,%%xmm2 \n" - "movdqa %%xmm0,%%xmm3 \n" - "punpcklbw %%xmm5,%%xmm3 \n" - "punpckhbw %%xmm5,%%xmm0 \n" - "movdqa %%xmm1,%%xmm4 \n" - "punpcklbw %%xmm2,%%xmm4 \n" - "punpckhbw %%xmm2,%%xmm1 \n" - "movdqa %%xmm4,%%xmm6 \n" - "punpcklwd %%xmm3,%%xmm6 \n" - "punpckhwd %%xmm3,%%xmm4 \n" - "movdqa %%xmm1,%%xmm7 \n" - "punpcklwd %%xmm0,%%xmm7 \n" - "punpckhwd %%xmm0,%%xmm1 \n" - "movdqu %%xmm6,(%2) \n" - "movdqu %%xmm4,0x10(%2) \n" - "movdqu %%xmm7,0x20(%2) \n" - "movdqu %%xmm1,0x30(%2) \n" - "lea 0x40(%2),%2 \n" - "sub $0x10,%3 \n" - "jg 1b \n" - : "+r"(src_sobelx), // %0 - "+r"(src_sobely), // %1 - "+r"(dst_argb), // %2 - "+r"(width) // %3 - : - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", - "xmm7"); -} -#endif // HAS_SOBELXYROW_SSE2 - -#ifdef HAS_COMPUTECUMULATIVESUMROW_SSE2 -// Creates a table of cumulative sums where each value is a sum of all values -// above and to the left of the value, inclusive of the value. -void ComputeCumulativeSumRow_SSE2(const uint8_t* row, - int32_t* cumsum, - const int32_t* previous_cumsum, - int width) { - asm volatile( - "pxor %%xmm0,%%xmm0 \n" - "pxor %%xmm1,%%xmm1 \n" - "sub $0x4,%3 \n" - "jl 49f \n" - "test $0xf,%1 \n" - "jne 49f \n" - - // 4 pixel loop. 
- LABELALIGN - "40: \n" - "movdqu (%0),%%xmm2 \n" - "lea 0x10(%0),%0 \n" - "movdqa %%xmm2,%%xmm4 \n" - "punpcklbw %%xmm1,%%xmm2 \n" - "movdqa %%xmm2,%%xmm3 \n" - "punpcklwd %%xmm1,%%xmm2 \n" - "punpckhwd %%xmm1,%%xmm3 \n" - "punpckhbw %%xmm1,%%xmm4 \n" - "movdqa %%xmm4,%%xmm5 \n" - "punpcklwd %%xmm1,%%xmm4 \n" - "punpckhwd %%xmm1,%%xmm5 \n" - "paddd %%xmm2,%%xmm0 \n" - "movdqu (%2),%%xmm2 \n" - "paddd %%xmm0,%%xmm2 \n" - "paddd %%xmm3,%%xmm0 \n" - "movdqu 0x10(%2),%%xmm3 \n" - "paddd %%xmm0,%%xmm3 \n" - "paddd %%xmm4,%%xmm0 \n" - "movdqu 0x20(%2),%%xmm4 \n" - "paddd %%xmm0,%%xmm4 \n" - "paddd %%xmm5,%%xmm0 \n" - "movdqu 0x30(%2),%%xmm5 \n" - "lea 0x40(%2),%2 \n" - "paddd %%xmm0,%%xmm5 \n" - "movdqu %%xmm2,(%1) \n" - "movdqu %%xmm3,0x10(%1) \n" - "movdqu %%xmm4,0x20(%1) \n" - "movdqu %%xmm5,0x30(%1) \n" - "lea 0x40(%1),%1 \n" - "sub $0x4,%3 \n" - "jge 40b \n" - - "49: \n" - "add $0x3,%3 \n" - "jl 19f \n" - - // 1 pixel loop. - LABELALIGN - "10: \n" - "movd (%0),%%xmm2 \n" - "lea 0x4(%0),%0 \n" - "punpcklbw %%xmm1,%%xmm2 \n" - "punpcklwd %%xmm1,%%xmm2 \n" - "paddd %%xmm2,%%xmm0 \n" - "movdqu (%2),%%xmm2 \n" - "lea 0x10(%2),%2 \n" - "paddd %%xmm0,%%xmm2 \n" - "movdqu %%xmm2,(%1) \n" - "lea 0x10(%1),%1 \n" - "sub $0x1,%3 \n" - "jge 10b \n" - - "19: \n" - : "+r"(row), // %0 - "+r"(cumsum), // %1 - "+r"(previous_cumsum), // %2 - "+r"(width) // %3 - : - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); -} -#endif // HAS_COMPUTECUMULATIVESUMROW_SSE2 - -#ifdef HAS_CUMULATIVESUMTOAVERAGEROW_SSE2 -void CumulativeSumToAverageRow_SSE2(const int32_t* topleft, - const int32_t* botleft, - int width, - int area, - uint8_t* dst, - int count) { - asm volatile( - "movd %5,%%xmm5 \n" - "cvtdq2ps %%xmm5,%%xmm5 \n" - "rcpss %%xmm5,%%xmm4 \n" - "pshufd $0x0,%%xmm4,%%xmm4 \n" - "sub $0x4,%3 \n" - "jl 49f \n" - "cmpl $0x80,%5 \n" - "ja 40f \n" - - "pshufd $0x0,%%xmm5,%%xmm5 \n" - "pcmpeqb %%xmm6,%%xmm6 \n" - "psrld $0x10,%%xmm6 \n" - "cvtdq2ps %%xmm6,%%xmm6 \n" - "addps %%xmm6,%%xmm5 \n" - "mulps %%xmm4,%%xmm5 \n" - "cvtps2dq %%xmm5,%%xmm5 \n" - "packssdw %%xmm5,%%xmm5 \n" - - // 4 pixel small loop. 
- LABELALIGN - "4: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "movdqu 0x20(%0),%%xmm2 \n" - "movdqu 0x30(%0),%%xmm3 \n" - "psubd 0x00(%0,%4,4),%%xmm0 \n" - "psubd 0x10(%0,%4,4),%%xmm1 \n" - "psubd 0x20(%0,%4,4),%%xmm2 \n" - "psubd 0x30(%0,%4,4),%%xmm3 \n" - "lea 0x40(%0),%0 \n" - "psubd (%1),%%xmm0 \n" - "psubd 0x10(%1),%%xmm1 \n" - "psubd 0x20(%1),%%xmm2 \n" - "psubd 0x30(%1),%%xmm3 \n" - "paddd 0x00(%1,%4,4),%%xmm0 \n" - "paddd 0x10(%1,%4,4),%%xmm1 \n" - "paddd 0x20(%1,%4,4),%%xmm2 \n" - "paddd 0x30(%1,%4,4),%%xmm3 \n" - "lea 0x40(%1),%1 \n" - "packssdw %%xmm1,%%xmm0 \n" - "packssdw %%xmm3,%%xmm2 \n" - "pmulhuw %%xmm5,%%xmm0 \n" - "pmulhuw %%xmm5,%%xmm2 \n" - "packuswb %%xmm2,%%xmm0 \n" - "movdqu %%xmm0,(%2) \n" - "lea 0x10(%2),%2 \n" - "sub $0x4,%3 \n" - "jge 4b \n" - "jmp 49f \n" - - // 4 pixel loop - LABELALIGN - "40: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "movdqu 0x20(%0),%%xmm2 \n" - "movdqu 0x30(%0),%%xmm3 \n" - "psubd 0x00(%0,%4,4),%%xmm0 \n" - "psubd 0x10(%0,%4,4),%%xmm1 \n" - "psubd 0x20(%0,%4,4),%%xmm2 \n" - "psubd 0x30(%0,%4,4),%%xmm3 \n" - "lea 0x40(%0),%0 \n" - "psubd (%1),%%xmm0 \n" - "psubd 0x10(%1),%%xmm1 \n" - "psubd 0x20(%1),%%xmm2 \n" - "psubd 0x30(%1),%%xmm3 \n" - "paddd 0x00(%1,%4,4),%%xmm0 \n" - "paddd 0x10(%1,%4,4),%%xmm1 \n" - "paddd 0x20(%1,%4,4),%%xmm2 \n" - "paddd 0x30(%1,%4,4),%%xmm3 \n" - "lea 0x40(%1),%1 \n" - "cvtdq2ps %%xmm0,%%xmm0 \n" - "cvtdq2ps %%xmm1,%%xmm1 \n" - "mulps %%xmm4,%%xmm0 \n" - "mulps %%xmm4,%%xmm1 \n" - "cvtdq2ps %%xmm2,%%xmm2 \n" - "cvtdq2ps %%xmm3,%%xmm3 \n" - "mulps %%xmm4,%%xmm2 \n" - "mulps %%xmm4,%%xmm3 \n" - "cvtps2dq %%xmm0,%%xmm0 \n" - "cvtps2dq %%xmm1,%%xmm1 \n" - "cvtps2dq %%xmm2,%%xmm2 \n" - "cvtps2dq %%xmm3,%%xmm3 \n" - "packssdw %%xmm1,%%xmm0 \n" - "packssdw %%xmm3,%%xmm2 \n" - "packuswb %%xmm2,%%xmm0 \n" - "movdqu %%xmm0,(%2) \n" - "lea 0x10(%2),%2 \n" - "sub $0x4,%3 \n" - "jge 40b \n" - - "49: \n" - "add $0x3,%3 \n" - "jl 19f \n" - - // 1 pixel loop - LABELALIGN - "10: \n" - "movdqu (%0),%%xmm0 \n" - "psubd 0x00(%0,%4,4),%%xmm0 \n" - "lea 0x10(%0),%0 \n" - "psubd (%1),%%xmm0 \n" - "paddd 0x00(%1,%4,4),%%xmm0 \n" - "lea 0x10(%1),%1 \n" - "cvtdq2ps %%xmm0,%%xmm0 \n" - "mulps %%xmm4,%%xmm0 \n" - "cvtps2dq %%xmm0,%%xmm0 \n" - "packssdw %%xmm0,%%xmm0 \n" - "packuswb %%xmm0,%%xmm0 \n" - "movd %%xmm0,(%2) \n" - "lea 0x4(%2),%2 \n" - "sub $0x1,%3 \n" - "jge 10b \n" - "19: \n" - : "+r"(topleft), // %0 - "+r"(botleft), // %1 - "+r"(dst), // %2 - "+rm"(count) // %3 - : "r"((intptr_t)(width)), // %4 - "rm"(area) // %5 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"); -} -#endif // HAS_CUMULATIVESUMTOAVERAGEROW_SSE2 - -#ifdef HAS_ARGBAFFINEROW_SSE2 -// Copy ARGB pixels from source image with slope to a row of destination. 
-LIBYUV_API -void ARGBAffineRow_SSE2(const uint8_t* src_argb, - int src_argb_stride, - uint8_t* dst_argb, - const float* src_dudv, - int width) { - intptr_t src_argb_stride_temp = src_argb_stride; - intptr_t temp; - asm volatile( - "movq (%3),%%xmm2 \n" - "movq 0x08(%3),%%xmm7 \n" - "shl $0x10,%1 \n" - "add $0x4,%1 \n" - "movd %1,%%xmm5 \n" - "sub $0x4,%4 \n" - "jl 49f \n" - - "pshufd $0x44,%%xmm7,%%xmm7 \n" - "pshufd $0x0,%%xmm5,%%xmm5 \n" - "movdqa %%xmm2,%%xmm0 \n" - "addps %%xmm7,%%xmm0 \n" - "movlhps %%xmm0,%%xmm2 \n" - "movdqa %%xmm7,%%xmm4 \n" - "addps %%xmm4,%%xmm4 \n" - "movdqa %%xmm2,%%xmm3 \n" - "addps %%xmm4,%%xmm3 \n" - "addps %%xmm4,%%xmm4 \n" - - // 4 pixel loop - LABELALIGN - "40: \n" - "cvttps2dq %%xmm2,%%xmm0 \n" // x,y float->int first 2 - "cvttps2dq %%xmm3,%%xmm1 \n" // x,y float->int next 2 - "packssdw %%xmm1,%%xmm0 \n" // x, y as 8 shorts - "pmaddwd %%xmm5,%%xmm0 \n" // off = x*4 + y*stride - "movd %%xmm0,%k1 \n" - "pshufd $0x39,%%xmm0,%%xmm0 \n" - "movd %%xmm0,%k5 \n" - "pshufd $0x39,%%xmm0,%%xmm0 \n" - "movd 0x00(%0,%1,1),%%xmm1 \n" - "movd 0x00(%0,%5,1),%%xmm6 \n" - "punpckldq %%xmm6,%%xmm1 \n" - "addps %%xmm4,%%xmm2 \n" - "movq %%xmm1,(%2) \n" - "movd %%xmm0,%k1 \n" - "pshufd $0x39,%%xmm0,%%xmm0 \n" - "movd %%xmm0,%k5 \n" - "movd 0x00(%0,%1,1),%%xmm0 \n" - "movd 0x00(%0,%5,1),%%xmm6 \n" - "punpckldq %%xmm6,%%xmm0 \n" - "addps %%xmm4,%%xmm3 \n" - "movq %%xmm0,0x08(%2) \n" - "lea 0x10(%2),%2 \n" - "sub $0x4,%4 \n" - "jge 40b \n" - - "49: \n" - "add $0x3,%4 \n" - "jl 19f \n" - - // 1 pixel loop - LABELALIGN - "10: \n" - "cvttps2dq %%xmm2,%%xmm0 \n" - "packssdw %%xmm0,%%xmm0 \n" - "pmaddwd %%xmm5,%%xmm0 \n" - "addps %%xmm7,%%xmm2 \n" - "movd %%xmm0,%k1 \n" - "movd 0x00(%0,%1,1),%%xmm0 \n" - "movd %%xmm0,(%2) \n" - "lea 0x04(%2),%2 \n" - "sub $0x1,%4 \n" - "jge 10b \n" - "19: \n" - : "+r"(src_argb), // %0 - "+r"(src_argb_stride_temp), // %1 - "+r"(dst_argb), // %2 - "+r"(src_dudv), // %3 - "+rm"(width), // %4 - "=&r"(temp) // %5 - : - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", - "xmm7"); -} -#endif // HAS_ARGBAFFINEROW_SSE2 - -#ifdef HAS_INTERPOLATEROW_SSSE3 -// Bilinear filter 16x2 -> 16x1 -void InterpolateRow_SSSE3(uint8_t* dst_ptr, - const uint8_t* src_ptr, - ptrdiff_t src_stride, - int dst_width, - int source_y_fraction) { - asm volatile( - "sub %1,%0 \n" - "cmp $0x0,%3 \n" - "je 100f \n" - "cmp $0x80,%3 \n" - "je 50f \n" - - "movd %3,%%xmm0 \n" - "neg %3 \n" - "add $0x100,%3 \n" - "movd %3,%%xmm5 \n" - "punpcklbw %%xmm0,%%xmm5 \n" - "punpcklwd %%xmm5,%%xmm5 \n" - "pshufd $0x0,%%xmm5,%%xmm5 \n" - "mov $0x80808080,%%eax \n" - "movd %%eax,%%xmm4 \n" - "pshufd $0x0,%%xmm4,%%xmm4 \n" - - // General purpose row blend. - LABELALIGN - "1: \n" - "movdqu (%1),%%xmm0 \n" - "movdqu 0x00(%1,%4,1),%%xmm2 \n" - "movdqa %%xmm0,%%xmm1 \n" - "punpcklbw %%xmm2,%%xmm0 \n" - "punpckhbw %%xmm2,%%xmm1 \n" - "psubb %%xmm4,%%xmm0 \n" - "psubb %%xmm4,%%xmm1 \n" - "movdqa %%xmm5,%%xmm2 \n" - "movdqa %%xmm5,%%xmm3 \n" - "pmaddubsw %%xmm0,%%xmm2 \n" - "pmaddubsw %%xmm1,%%xmm3 \n" - "paddw %%xmm4,%%xmm2 \n" - "paddw %%xmm4,%%xmm3 \n" - "psrlw $0x8,%%xmm2 \n" - "psrlw $0x8,%%xmm3 \n" - "packuswb %%xmm3,%%xmm2 \n" - "movdqu %%xmm2,0x00(%1,%0,1) \n" - "lea 0x10(%1),%1 \n" - "sub $0x10,%2 \n" - "jg 1b \n" - "jmp 99f \n" - - // Blend 50 / 50. 
- LABELALIGN - "50: \n" - "movdqu (%1),%%xmm0 \n" - "movdqu 0x00(%1,%4,1),%%xmm1 \n" - "pavgb %%xmm1,%%xmm0 \n" - "movdqu %%xmm0,0x00(%1,%0,1) \n" - "lea 0x10(%1),%1 \n" - "sub $0x10,%2 \n" - "jg 50b \n" - "jmp 99f \n" - - // Blend 100 / 0 - Copy row unchanged. - LABELALIGN - "100: \n" - "movdqu (%1),%%xmm0 \n" - "movdqu %%xmm0,0x00(%1,%0,1) \n" - "lea 0x10(%1),%1 \n" - "sub $0x10,%2 \n" - "jg 100b \n" - - "99: \n" - : "+r"(dst_ptr), // %0 - "+r"(src_ptr), // %1 - "+rm"(dst_width), // %2 - "+r"(source_y_fraction) // %3 - : "r"((intptr_t)(src_stride)) // %4 - : "memory", "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); -} -#endif // HAS_INTERPOLATEROW_SSSE3 - -#ifdef HAS_INTERPOLATEROW_AVX2 -// Bilinear filter 32x2 -> 32x1 -void InterpolateRow_AVX2(uint8_t* dst_ptr, - const uint8_t* src_ptr, - ptrdiff_t src_stride, - int dst_width, - int source_y_fraction) { - asm volatile( - "cmp $0x0,%3 \n" - "je 100f \n" - "sub %1,%0 \n" - "cmp $0x80,%3 \n" - "je 50f \n" - - "vmovd %3,%%xmm0 \n" - "neg %3 \n" - "add $0x100,%3 \n" - "vmovd %3,%%xmm5 \n" - "vpunpcklbw %%xmm0,%%xmm5,%%xmm5 \n" - "vpunpcklwd %%xmm5,%%xmm5,%%xmm5 \n" - "vbroadcastss %%xmm5,%%ymm5 \n" - "mov $0x80808080,%%eax \n" - "vmovd %%eax,%%xmm4 \n" - "vbroadcastss %%xmm4,%%ymm4 \n" - - // General purpose row blend. - LABELALIGN - "1: \n" - "vmovdqu (%1),%%ymm0 \n" - "vmovdqu 0x00(%1,%4,1),%%ymm2 \n" - "vpunpckhbw %%ymm2,%%ymm0,%%ymm1 \n" - "vpunpcklbw %%ymm2,%%ymm0,%%ymm0 \n" - "vpsubb %%ymm4,%%ymm1,%%ymm1 \n" - "vpsubb %%ymm4,%%ymm0,%%ymm0 \n" - "vpmaddubsw %%ymm1,%%ymm5,%%ymm1 \n" - "vpmaddubsw %%ymm0,%%ymm5,%%ymm0 \n" - "vpaddw %%ymm4,%%ymm1,%%ymm1 \n" - "vpaddw %%ymm4,%%ymm0,%%ymm0 \n" - "vpsrlw $0x8,%%ymm1,%%ymm1 \n" - "vpsrlw $0x8,%%ymm0,%%ymm0 \n" - "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" - "vmovdqu %%ymm0,0x00(%1,%0,1) \n" - "lea 0x20(%1),%1 \n" - "sub $0x20,%2 \n" - "jg 1b \n" - "jmp 99f \n" - - // Blend 50 / 50. - LABELALIGN - "50: \n" - "vmovdqu (%1),%%ymm0 \n" - "vpavgb 0x00(%1,%4,1),%%ymm0,%%ymm0 \n" - "vmovdqu %%ymm0,0x00(%1,%0,1) \n" - "lea 0x20(%1),%1 \n" - "sub $0x20,%2 \n" - "jg 50b \n" - "jmp 99f \n" - - // Blend 100 / 0 - Copy row unchanged. - LABELALIGN - "100: \n" - "rep movsb \n" - "jmp 999f \n" - - "99: \n" - "vzeroupper \n" - "999: \n" - : "+D"(dst_ptr), // %0 - "+S"(src_ptr), // %1 - "+cm"(dst_width), // %2 - "+r"(source_y_fraction) // %3 - : "r"((intptr_t)(src_stride)) // %4 - : "memory", "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm4", "xmm5"); -} -#endif // HAS_INTERPOLATEROW_AVX2 - -#ifdef HAS_ARGBSHUFFLEROW_SSSE3 -// For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA. -void ARGBShuffleRow_SSSE3(const uint8_t* src_argb, - uint8_t* dst_argb, - const uint8_t* shuffler, - int width) { - asm volatile( - - "movdqu (%3),%%xmm5 \n" - - LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "lea 0x20(%0),%0 \n" - "pshufb %%xmm5,%%xmm0 \n" - "pshufb %%xmm5,%%xmm1 \n" - "movdqu %%xmm0,(%1) \n" - "movdqu %%xmm1,0x10(%1) \n" - "lea 0x20(%1),%1 \n" - "sub $0x8,%2 \n" - "jg 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_argb), // %1 - "+r"(width) // %2 - : "r"(shuffler) // %3 - : "memory", "cc", "xmm0", "xmm1", "xmm5"); -} -#endif // HAS_ARGBSHUFFLEROW_SSSE3 - -#ifdef HAS_ARGBSHUFFLEROW_AVX2 -// For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA. 
-void ARGBShuffleRow_AVX2(const uint8_t* src_argb, - uint8_t* dst_argb, - const uint8_t* shuffler, - int width) { - asm volatile( - - "vbroadcastf128 (%3),%%ymm5 \n" - - LABELALIGN - "1: \n" - "vmovdqu (%0),%%ymm0 \n" - "vmovdqu 0x20(%0),%%ymm1 \n" - "lea 0x40(%0),%0 \n" - "vpshufb %%ymm5,%%ymm0,%%ymm0 \n" - "vpshufb %%ymm5,%%ymm1,%%ymm1 \n" - "vmovdqu %%ymm0,(%1) \n" - "vmovdqu %%ymm1,0x20(%1) \n" - "lea 0x40(%1),%1 \n" - "sub $0x10,%2 \n" - "jg 1b \n" - "vzeroupper \n" - : "+r"(src_argb), // %0 - "+r"(dst_argb), // %1 - "+r"(width) // %2 - : "r"(shuffler) // %3 - : "memory", "cc", "xmm0", "xmm1", "xmm5"); -} -#endif // HAS_ARGBSHUFFLEROW_AVX2 - -#ifdef HAS_I422TOYUY2ROW_SSE2 -void I422ToYUY2Row_SSE2(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* dst_yuy2, - int width) { - asm volatile( - - "sub %1,%2 \n" - - LABELALIGN - "1: \n" - "movq (%1),%%xmm2 \n" - "movq 0x00(%1,%2,1),%%xmm1 \n" - "add $0x8,%1 \n" - "punpcklbw %%xmm1,%%xmm2 \n" - "movdqu (%0),%%xmm0 \n" - "add $0x10,%0 \n" - "movdqa %%xmm0,%%xmm1 \n" - "punpcklbw %%xmm2,%%xmm0 \n" - "punpckhbw %%xmm2,%%xmm1 \n" - "movdqu %%xmm0,(%3) \n" - "movdqu %%xmm1,0x10(%3) \n" - "lea 0x20(%3),%3 \n" - "sub $0x10,%4 \n" - "jg 1b \n" - : "+r"(src_y), // %0 - "+r"(src_u), // %1 - "+r"(src_v), // %2 - "+r"(dst_yuy2), // %3 - "+rm"(width) // %4 - : - : "memory", "cc", "xmm0", "xmm1", "xmm2"); -} -#endif // HAS_I422TOYUY2ROW_SSE2 - -#ifdef HAS_I422TOUYVYROW_SSE2 -void I422ToUYVYRow_SSE2(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* dst_uyvy, - int width) { - asm volatile( - - "sub %1,%2 \n" - - LABELALIGN - "1: \n" - "movq (%1),%%xmm2 \n" - "movq 0x00(%1,%2,1),%%xmm1 \n" - "add $0x8,%1 \n" - "punpcklbw %%xmm1,%%xmm2 \n" - "movdqu (%0),%%xmm0 \n" - "movdqa %%xmm2,%%xmm1 \n" - "add $0x10,%0 \n" - "punpcklbw %%xmm0,%%xmm1 \n" - "punpckhbw %%xmm0,%%xmm2 \n" - "movdqu %%xmm1,(%3) \n" - "movdqu %%xmm2,0x10(%3) \n" - "lea 0x20(%3),%3 \n" - "sub $0x10,%4 \n" - "jg 1b \n" - : "+r"(src_y), // %0 - "+r"(src_u), // %1 - "+r"(src_v), // %2 - "+r"(dst_uyvy), // %3 - "+rm"(width) // %4 - : - : "memory", "cc", "xmm0", "xmm1", "xmm2"); -} -#endif // HAS_I422TOUYVYROW_SSE2 - -#ifdef HAS_I422TOYUY2ROW_AVX2 -void I422ToYUY2Row_AVX2(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* dst_yuy2, - int width) { - asm volatile( - - "sub %1,%2 \n" - - LABELALIGN - "1: \n" - "vpmovzxbw (%1),%%ymm1 \n" - "vpmovzxbw 0x00(%1,%2,1),%%ymm2 \n" - "add $0x10,%1 \n" - "vpsllw $0x8,%%ymm2,%%ymm2 \n" - "vpor %%ymm1,%%ymm2,%%ymm2 \n" - "vmovdqu (%0),%%ymm0 \n" - "add $0x20,%0 \n" - "vpunpcklbw %%ymm2,%%ymm0,%%ymm1 \n" - "vpunpckhbw %%ymm2,%%ymm0,%%ymm2 \n" - "vextractf128 $0x0,%%ymm1,(%3) \n" - "vextractf128 $0x0,%%ymm2,0x10(%3) \n" - "vextractf128 $0x1,%%ymm1,0x20(%3) \n" - "vextractf128 $0x1,%%ymm2,0x30(%3) \n" - "lea 0x40(%3),%3 \n" - "sub $0x20,%4 \n" - "jg 1b \n" - "vzeroupper \n" - : "+r"(src_y), // %0 - "+r"(src_u), // %1 - "+r"(src_v), // %2 - "+r"(dst_yuy2), // %3 - "+rm"(width) // %4 - : - : "memory", "cc", "xmm0", "xmm1", "xmm2"); -} -#endif // HAS_I422TOYUY2ROW_AVX2 - -#ifdef HAS_I422TOUYVYROW_AVX2 -void I422ToUYVYRow_AVX2(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* dst_uyvy, - int width) { - asm volatile( - - "sub %1,%2 \n" - - LABELALIGN - "1: \n" - "vpmovzxbw (%1),%%ymm1 \n" - "vpmovzxbw 0x00(%1,%2,1),%%ymm2 \n" - "add $0x10,%1 \n" - "vpsllw $0x8,%%ymm2,%%ymm2 \n" - "vpor %%ymm1,%%ymm2,%%ymm2 \n" - "vmovdqu (%0),%%ymm0 \n" - "add $0x20,%0 
\n" - "vpunpcklbw %%ymm0,%%ymm2,%%ymm1 \n" - "vpunpckhbw %%ymm0,%%ymm2,%%ymm2 \n" - "vextractf128 $0x0,%%ymm1,(%3) \n" - "vextractf128 $0x0,%%ymm2,0x10(%3) \n" - "vextractf128 $0x1,%%ymm1,0x20(%3) \n" - "vextractf128 $0x1,%%ymm2,0x30(%3) \n" - "lea 0x40(%3),%3 \n" - "sub $0x20,%4 \n" - "jg 1b \n" - "vzeroupper \n" - : "+r"(src_y), // %0 - "+r"(src_u), // %1 - "+r"(src_v), // %2 - "+r"(dst_uyvy), // %3 - "+rm"(width) // %4 - : - : "memory", "cc", "xmm0", "xmm1", "xmm2"); -} -#endif // HAS_I422TOUYVYROW_AVX2 - -#ifdef HAS_ARGBPOLYNOMIALROW_SSE2 -void ARGBPolynomialRow_SSE2(const uint8_t* src_argb, - uint8_t* dst_argb, - const float* poly, - int width) { - asm volatile( - - "pxor %%xmm3,%%xmm3 \n" - - // 2 pixel loop. - LABELALIGN - "1: \n" - "movq (%0),%%xmm0 \n" - "lea 0x8(%0),%0 \n" - "punpcklbw %%xmm3,%%xmm0 \n" - "movdqa %%xmm0,%%xmm4 \n" - "punpcklwd %%xmm3,%%xmm0 \n" - "punpckhwd %%xmm3,%%xmm4 \n" - "cvtdq2ps %%xmm0,%%xmm0 \n" - "cvtdq2ps %%xmm4,%%xmm4 \n" - "movdqa %%xmm0,%%xmm1 \n" - "movdqa %%xmm4,%%xmm5 \n" - "mulps 0x10(%3),%%xmm0 \n" - "mulps 0x10(%3),%%xmm4 \n" - "addps (%3),%%xmm0 \n" - "addps (%3),%%xmm4 \n" - "movdqa %%xmm1,%%xmm2 \n" - "movdqa %%xmm5,%%xmm6 \n" - "mulps %%xmm1,%%xmm2 \n" - "mulps %%xmm5,%%xmm6 \n" - "mulps %%xmm2,%%xmm1 \n" - "mulps %%xmm6,%%xmm5 \n" - "mulps 0x20(%3),%%xmm2 \n" - "mulps 0x20(%3),%%xmm6 \n" - "mulps 0x30(%3),%%xmm1 \n" - "mulps 0x30(%3),%%xmm5 \n" - "addps %%xmm2,%%xmm0 \n" - "addps %%xmm6,%%xmm4 \n" - "addps %%xmm1,%%xmm0 \n" - "addps %%xmm5,%%xmm4 \n" - "cvttps2dq %%xmm0,%%xmm0 \n" - "cvttps2dq %%xmm4,%%xmm4 \n" - "packuswb %%xmm4,%%xmm0 \n" - "packuswb %%xmm0,%%xmm0 \n" - "movq %%xmm0,(%1) \n" - "lea 0x8(%1),%1 \n" - "sub $0x2,%2 \n" - "jg 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_argb), // %1 - "+r"(width) // %2 - : "r"(poly) // %3 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"); -} -#endif // HAS_ARGBPOLYNOMIALROW_SSE2 - -#ifdef HAS_ARGBPOLYNOMIALROW_AVX2 -void ARGBPolynomialRow_AVX2(const uint8_t* src_argb, - uint8_t* dst_argb, - const float* poly, - int width) { - asm volatile( - "vbroadcastf128 (%3),%%ymm4 \n" - "vbroadcastf128 0x10(%3),%%ymm5 \n" - "vbroadcastf128 0x20(%3),%%ymm6 \n" - "vbroadcastf128 0x30(%3),%%ymm7 \n" - - // 2 pixel loop. - LABELALIGN - "1: \n" - "vpmovzxbd (%0),%%ymm0 \n" // 2 ARGB pixels - "lea 0x8(%0),%0 \n" - "vcvtdq2ps %%ymm0,%%ymm0 \n" // X 8 floats - "vmulps %%ymm0,%%ymm0,%%ymm2 \n" // X * X - "vmulps %%ymm7,%%ymm0,%%ymm3 \n" // C3 * X - "vfmadd132ps %%ymm5,%%ymm4,%%ymm0 \n" // result = C0 + C1 * X - "vfmadd231ps %%ymm6,%%ymm2,%%ymm0 \n" // result += C2 * X * X - "vfmadd231ps %%ymm3,%%ymm2,%%ymm0 \n" // result += C3 * X * X * - // X - "vcvttps2dq %%ymm0,%%ymm0 \n" - "vpackusdw %%ymm0,%%ymm0,%%ymm0 \n" - "vpermq $0xd8,%%ymm0,%%ymm0 \n" - "vpackuswb %%xmm0,%%xmm0,%%xmm0 \n" - "vmovq %%xmm0,(%1) \n" - "lea 0x8(%1),%1 \n" - "sub $0x2,%2 \n" - "jg 1b \n" - "vzeroupper \n" - : "+r"(src_argb), // %0 - "+r"(dst_argb), // %1 - "+r"(width) // %2 - : "r"(poly) // %3 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", - "xmm7"); -} -#endif // HAS_ARGBPOLYNOMIALROW_AVX2 - -#ifdef HAS_HALFFLOATROW_SSE2 -static float kScaleBias = 1.9259299444e-34f; -void HalfFloatRow_SSE2(const uint16_t* src, - uint16_t* dst, - float scale, - int width) { - scale *= kScaleBias; - asm volatile( - "movd %3,%%xmm4 \n" - "pshufd $0x0,%%xmm4,%%xmm4 \n" - "pxor %%xmm5,%%xmm5 \n" - "sub %0,%1 \n" - - // 16 pixel loop. 
- LABELALIGN - "1: \n" - "movdqu (%0),%%xmm2 \n" // 8 shorts - "add $0x10,%0 \n" - "movdqa %%xmm2,%%xmm3 \n" - "punpcklwd %%xmm5,%%xmm2 \n" // 8 ints in xmm2/1 - "cvtdq2ps %%xmm2,%%xmm2 \n" // 8 floats - "punpckhwd %%xmm5,%%xmm3 \n" - "cvtdq2ps %%xmm3,%%xmm3 \n" - "mulps %%xmm4,%%xmm2 \n" - "mulps %%xmm4,%%xmm3 \n" - "psrld $0xd,%%xmm2 \n" - "psrld $0xd,%%xmm3 \n" - "packssdw %%xmm3,%%xmm2 \n" - "movdqu %%xmm2,-0x10(%0,%1,1) \n" - "sub $0x8,%2 \n" - "jg 1b \n" - : "+r"(src), // %0 - "+r"(dst), // %1 - "+r"(width) // %2 - : "m"(scale) // %3 - : "memory", "cc", "xmm2", "xmm3", "xmm4", "xmm5"); -} -#endif // HAS_HALFFLOATROW_SSE2 - -#ifdef HAS_HALFFLOATROW_AVX2 -void HalfFloatRow_AVX2(const uint16_t* src, - uint16_t* dst, - float scale, - int width) { - scale *= kScaleBias; - asm volatile( - "vbroadcastss %3, %%ymm4 \n" - "vpxor %%ymm5,%%ymm5,%%ymm5 \n" - "sub %0,%1 \n" - - // 16 pixel loop. - LABELALIGN - "1: \n" - "vmovdqu (%0),%%ymm2 \n" // 16 shorts - "add $0x20,%0 \n" - "vpunpckhwd %%ymm5,%%ymm2,%%ymm3 \n" // mutates - "vpunpcklwd %%ymm5,%%ymm2,%%ymm2 \n" - "vcvtdq2ps %%ymm3,%%ymm3 \n" - "vcvtdq2ps %%ymm2,%%ymm2 \n" - "vmulps %%ymm3,%%ymm4,%%ymm3 \n" - "vmulps %%ymm2,%%ymm4,%%ymm2 \n" - "vpsrld $0xd,%%ymm3,%%ymm3 \n" - "vpsrld $0xd,%%ymm2,%%ymm2 \n" - "vpackssdw %%ymm3, %%ymm2, %%ymm2 \n" // unmutates - "vmovdqu %%ymm2,-0x20(%0,%1,1) \n" - "sub $0x10,%2 \n" - "jg 1b \n" - - "vzeroupper \n" - : "+r"(src), // %0 - "+r"(dst), // %1 - "+r"(width) // %2 -#if defined(__x86_64__) - : "x"(scale) // %3 -#else - : "m"(scale) // %3 -#endif - : "memory", "cc", "xmm2", "xmm3", "xmm4", "xmm5"); -} -#endif // HAS_HALFFLOATROW_AVX2 - -#ifdef HAS_HALFFLOATROW_F16C -void HalfFloatRow_F16C(const uint16_t* src, - uint16_t* dst, - float scale, - int width) { - asm volatile( - "vbroadcastss %3, %%ymm4 \n" - "sub %0,%1 \n" - - // 16 pixel loop. - LABELALIGN - "1: \n" - "vpmovzxwd (%0),%%ymm2 \n" // 16 shorts -> 16 ints - "vpmovzxwd 0x10(%0),%%ymm3 \n" - "vcvtdq2ps %%ymm2,%%ymm2 \n" - "vcvtdq2ps %%ymm3,%%ymm3 \n" - "vmulps %%ymm2,%%ymm4,%%ymm2 \n" - "vmulps %%ymm3,%%ymm4,%%ymm3 \n" - "vcvtps2ph $3, %%ymm2, %%xmm2 \n" - "vcvtps2ph $3, %%ymm3, %%xmm3 \n" - "vmovdqu %%xmm2,0x00(%0,%1,1) \n" - "vmovdqu %%xmm3,0x10(%0,%1,1) \n" - "add $0x20,%0 \n" - "sub $0x10,%2 \n" - "jg 1b \n" - "vzeroupper \n" - : "+r"(src), // %0 - "+r"(dst), // %1 - "+r"(width) // %2 -#if defined(__x86_64__) - : "x"(scale) // %3 -#else - : "m"(scale) // %3 -#endif - : "memory", "cc", "xmm2", "xmm3", "xmm4"); -} -#endif // HAS_HALFFLOATROW_F16C - -#ifdef HAS_HALFFLOATROW_F16C -void HalfFloat1Row_F16C(const uint16_t* src, uint16_t* dst, float, int width) { - asm volatile( - "sub %0,%1 \n" - // 16 pixel loop. - LABELALIGN - "1: \n" - "vpmovzxwd (%0),%%ymm2 \n" // 16 shorts -> 16 ints - "vpmovzxwd 0x10(%0),%%ymm3 \n" - "vcvtdq2ps %%ymm2,%%ymm2 \n" - "vcvtdq2ps %%ymm3,%%ymm3 \n" - "vcvtps2ph $3, %%ymm2, %%xmm2 \n" - "vcvtps2ph $3, %%ymm3, %%xmm3 \n" - "vmovdqu %%xmm2,0x00(%0,%1,1) \n" - "vmovdqu %%xmm3,0x10(%0,%1,1) \n" - "add $0x20,%0 \n" - "sub $0x10,%2 \n" - "jg 1b \n" - "vzeroupper \n" - : "+r"(src), // %0 - "+r"(dst), // %1 - "+r"(width) // %2 - : - : "memory", "cc", "xmm2", "xmm3"); -} -#endif // HAS_HALFFLOATROW_F16C - -#ifdef HAS_ARGBCOLORTABLEROW_X86 -// Tranform ARGB pixels with color table. -void ARGBColorTableRow_X86(uint8_t* dst_argb, - const uint8_t* table_argb, - int width) { - uintptr_t pixel_temp; - asm volatile( - // 1 pixel loop. 
- LABELALIGN - "1: \n" - "movzb (%0),%1 \n" - "lea 0x4(%0),%0 \n" - "movzb 0x00(%3,%1,4),%1 \n" - "mov %b1,-0x4(%0) \n" - "movzb -0x3(%0),%1 \n" - "movzb 0x01(%3,%1,4),%1 \n" - "mov %b1,-0x3(%0) \n" - "movzb -0x2(%0),%1 \n" - "movzb 0x02(%3,%1,4),%1 \n" - "mov %b1,-0x2(%0) \n" - "movzb -0x1(%0),%1 \n" - "movzb 0x03(%3,%1,4),%1 \n" - "mov %b1,-0x1(%0) \n" - "dec %2 \n" - "jg 1b \n" - : "+r"(dst_argb), // %0 - "=&d"(pixel_temp), // %1 - "+r"(width) // %2 - : "r"(table_argb) // %3 - : "memory", "cc"); -} -#endif // HAS_ARGBCOLORTABLEROW_X86 - -#ifdef HAS_RGBCOLORTABLEROW_X86 -// Tranform RGB pixels with color table. -void RGBColorTableRow_X86(uint8_t* dst_argb, - const uint8_t* table_argb, - int width) { - uintptr_t pixel_temp; - asm volatile( - // 1 pixel loop. - LABELALIGN - "1: \n" - "movzb (%0),%1 \n" - "lea 0x4(%0),%0 \n" - "movzb 0x00(%3,%1,4),%1 \n" - "mov %b1,-0x4(%0) \n" - "movzb -0x3(%0),%1 \n" - "movzb 0x01(%3,%1,4),%1 \n" - "mov %b1,-0x3(%0) \n" - "movzb -0x2(%0),%1 \n" - "movzb 0x02(%3,%1,4),%1 \n" - "mov %b1,-0x2(%0) \n" - "dec %2 \n" - "jg 1b \n" - : "+r"(dst_argb), // %0 - "=&d"(pixel_temp), // %1 - "+r"(width) // %2 - : "r"(table_argb) // %3 - : "memory", "cc"); -} -#endif // HAS_RGBCOLORTABLEROW_X86 - -#ifdef HAS_ARGBLUMACOLORTABLEROW_SSSE3 -// Tranform RGB pixels with luma table. -void ARGBLumaColorTableRow_SSSE3(const uint8_t* src_argb, - uint8_t* dst_argb, - int width, - const uint8_t* luma, - uint32_t lumacoeff) { - uintptr_t pixel_temp; - uintptr_t table_temp; - asm volatile( - "movd %6,%%xmm3 \n" - "pshufd $0x0,%%xmm3,%%xmm3 \n" - "pcmpeqb %%xmm4,%%xmm4 \n" - "psllw $0x8,%%xmm4 \n" - "pxor %%xmm5,%%xmm5 \n" - - // 4 pixel loop. - LABELALIGN - "1: \n" - "movdqu (%2),%%xmm0 \n" - "pmaddubsw %%xmm3,%%xmm0 \n" - "phaddw %%xmm0,%%xmm0 \n" - "pand %%xmm4,%%xmm0 \n" - "punpcklwd %%xmm5,%%xmm0 \n" - "movd %%xmm0,%k1 \n" // 32 bit offset - "add %5,%1 \n" - "pshufd $0x39,%%xmm0,%%xmm0 \n" - - "movzb (%2),%0 \n" - "movzb 0x00(%1,%0,1),%0 \n" - "mov %b0,(%3) \n" - "movzb 0x1(%2),%0 \n" - "movzb 0x00(%1,%0,1),%0 \n" - "mov %b0,0x1(%3) \n" - "movzb 0x2(%2),%0 \n" - "movzb 0x00(%1,%0,1),%0 \n" - "mov %b0,0x2(%3) \n" - "movzb 0x3(%2),%0 \n" - "mov %b0,0x3(%3) \n" - - "movd %%xmm0,%k1 \n" // 32 bit offset - "add %5,%1 \n" - "pshufd $0x39,%%xmm0,%%xmm0 \n" - - "movzb 0x4(%2),%0 \n" - "movzb 0x00(%1,%0,1),%0 \n" - "mov %b0,0x4(%3) \n" - "movzb 0x5(%2),%0 \n" - "movzb 0x00(%1,%0,1),%0 \n" - "mov %b0,0x5(%3) \n" - "movzb 0x6(%2),%0 \n" - "movzb 0x00(%1,%0,1),%0 \n" - "mov %b0,0x6(%3) \n" - "movzb 0x7(%2),%0 \n" - "mov %b0,0x7(%3) \n" - - "movd %%xmm0,%k1 \n" // 32 bit offset - "add %5,%1 \n" - "pshufd $0x39,%%xmm0,%%xmm0 \n" - - "movzb 0x8(%2),%0 \n" - "movzb 0x00(%1,%0,1),%0 \n" - "mov %b0,0x8(%3) \n" - "movzb 0x9(%2),%0 \n" - "movzb 0x00(%1,%0,1),%0 \n" - "mov %b0,0x9(%3) \n" - "movzb 0xa(%2),%0 \n" - "movzb 0x00(%1,%0,1),%0 \n" - "mov %b0,0xa(%3) \n" - "movzb 0xb(%2),%0 \n" - "mov %b0,0xb(%3) \n" - - "movd %%xmm0,%k1 \n" // 32 bit offset - "add %5,%1 \n" - - "movzb 0xc(%2),%0 \n" - "movzb 0x00(%1,%0,1),%0 \n" - "mov %b0,0xc(%3) \n" - "movzb 0xd(%2),%0 \n" - "movzb 0x00(%1,%0,1),%0 \n" - "mov %b0,0xd(%3) \n" - "movzb 0xe(%2),%0 \n" - "movzb 0x00(%1,%0,1),%0 \n" - "mov %b0,0xe(%3) \n" - "movzb 0xf(%2),%0 \n" - "mov %b0,0xf(%3) \n" - "lea 0x10(%2),%2 \n" - "lea 0x10(%3),%3 \n" - "sub $0x4,%4 \n" - "jg 1b \n" - : "=&d"(pixel_temp), // %0 - "=&a"(table_temp), // %1 - "+r"(src_argb), // %2 - "+r"(dst_argb), // %3 - "+rm"(width) // %4 - : "r"(luma), // %5 - "rm"(lumacoeff) // %6 - : "memory", 
"cc", "xmm0", "xmm3", "xmm4", "xmm5"); -} -#endif // HAS_ARGBLUMACOLORTABLEROW_SSSE3 - -#ifdef HAS_NV21TOYUV24ROW_AVX2 - -// begin NV21ToYUV24Row_C avx2 constants -static const ulvec8 kBLEND0 = {0x80, 0x00, 0x80, 0x80, 0x00, 0x80, 0x80, 0x00, - 0x80, 0x80, 0x00, 0x80, 0x80, 0x00, 0x80, 0x80, - 0x00, 0x80, 0x00, 0x00, 0x80, 0x00, 0x00, 0x80, - 0x00, 0x00, 0x80, 0x00, 0x00, 0x80, 0x00, 0x00}; - -static const ulvec8 kBLEND1 = {0x00, 0x00, 0x80, 0x00, 0x00, 0x80, 0x00, 0x00, - 0x80, 0x00, 0x00, 0x80, 0x00, 0x00, 0x80, 0x00, - 0x80, 0x00, 0x00, 0x80, 0x00, 0x00, 0x80, 0x00, - 0x00, 0x80, 0x00, 0x00, 0x80, 0x00, 0x00, 0x80}; - -static const ulvec8 kBLEND2 = {0x80, 0x00, 0x00, 0x80, 0x00, 0x00, 0x80, 0x00, - 0x00, 0x80, 0x00, 0x00, 0x80, 0x00, 0x00, 0x80, - 0x00, 0x00, 0x80, 0x00, 0x00, 0x80, 0x00, 0x00, - 0x80, 0x00, 0x00, 0x80, 0x00, 0x00, 0x80, 0x00}; - -static const ulvec8 kSHUF0 = {0x00, 0x0b, 0x80, 0x01, 0x0c, 0x80, 0x02, 0x0d, - 0x80, 0x03, 0x0e, 0x80, 0x04, 0x0f, 0x80, 0x05, - 0x00, 0x0b, 0x80, 0x01, 0x0c, 0x80, 0x02, 0x0d, - 0x80, 0x03, 0x0e, 0x80, 0x04, 0x0f, 0x80, 0x05}; - -static const ulvec8 kSHUF1 = {0x80, 0x00, 0x0b, 0x80, 0x01, 0x0c, 0x80, 0x02, - 0x0d, 0x80, 0x03, 0x0e, 0x80, 0x04, 0x0f, 0x80, - 0x80, 0x00, 0x0b, 0x80, 0x01, 0x0c, 0x80, 0x02, - 0x0d, 0x80, 0x03, 0x0e, 0x80, 0x04, 0x0f, 0x80}; - -static const ulvec8 kSHUF2 = {0x0a, 0x80, 0x00, 0x0b, 0x80, 0x01, 0x0c, 0x80, - 0x02, 0x0d, 0x80, 0x03, 0x0e, 0x80, 0x04, 0x0f, - 0x0a, 0x80, 0x00, 0x0b, 0x80, 0x01, 0x0c, 0x80, - 0x02, 0x0d, 0x80, 0x03, 0x0e, 0x80, 0x04, 0x0f}; - -static const ulvec8 kSHUF3 = {0x80, 0x80, 0x06, 0x80, 0x80, 0x07, 0x80, 0x80, - 0x08, 0x80, 0x80, 0x09, 0x80, 0x80, 0x0a, 0x80, - 0x80, 0x80, 0x06, 0x80, 0x80, 0x07, 0x80, 0x80, - 0x08, 0x80, 0x80, 0x09, 0x80, 0x80, 0x0a, 0x80}; - -static const ulvec8 kSHUF4 = {0x05, 0x80, 0x80, 0x06, 0x80, 0x80, 0x07, 0x80, - 0x80, 0x08, 0x80, 0x80, 0x09, 0x80, 0x80, 0x0a, - 0x05, 0x80, 0x80, 0x06, 0x80, 0x80, 0x07, 0x80, - 0x80, 0x08, 0x80, 0x80, 0x09, 0x80, 0x80, 0x0a}; - -static const ulvec8 kSHUF5 = {0x80, 0x05, 0x80, 0x80, 0x06, 0x80, 0x80, 0x07, - 0x80, 0x80, 0x08, 0x80, 0x80, 0x09, 0x80, 0x80, - 0x80, 0x05, 0x80, 0x80, 0x06, 0x80, 0x80, 0x07, - 0x80, 0x80, 0x08, 0x80, 0x80, 0x09, 0x80, 0x80}; - -// NV21ToYUV24Row_AVX2 -void NV21ToYUV24Row_AVX2(const uint8_t* src_y, - const uint8_t* src_vu, - uint8_t* dst_yuv24, - int width) { - uint8_t* src_y_ptr; - uint64_t src_offset = 0; - uint64_t width64; - - width64 = width; - src_y_ptr = (uint8_t*)src_y; - - asm volatile( - "vmovdqu %5, %%ymm0 \n" // init blend value - "vmovdqu %6, %%ymm1 \n" // init blend value - "vmovdqu %7, %%ymm2 \n" // init blend value - // "sub $0x20, %3 \n" //sub 32 from - // width for final loop - - LABELALIGN - "1: \n" // label 1 - "vmovdqu (%0,%4), %%ymm3 \n" // src_y - "vmovdqu 1(%1,%4), %%ymm4 \n" // src_uv+1 - "vmovdqu (%1), %%ymm5 \n" // src_uv - "vpshufb %8, %%ymm3, %%ymm13 \n" // y, kSHUF0 for shuf - "vpshufb %9, %%ymm4, %%ymm14 \n" // uv+1, kSHUF1 for - // shuf - "vpshufb %10, %%ymm5, %%ymm15 \n" // uv, kSHUF2 for - // shuf - "vpshufb %11, %%ymm3, %%ymm3 \n" // y kSHUF3 for shuf - "vpshufb %12, %%ymm4, %%ymm4 \n" // uv+1 kSHUF4 for - // shuf - "vpblendvb %%ymm0, %%ymm14, %%ymm13, %%ymm12 \n" // blend 0 - "vpblendvb %%ymm0, %%ymm13, %%ymm14, %%ymm14 \n" // blend 0 - "vpblendvb %%ymm2, %%ymm15, %%ymm12, %%ymm12 \n" // blend 2 - "vpblendvb %%ymm1, %%ymm15, %%ymm14, %%ymm13 \n" // blend 1 - "vpshufb %13, %%ymm5, %%ymm15 \n" // shuffle const - "vpor %%ymm4, %%ymm3, %%ymm5 \n" // get results - "vmovdqu 
%%ymm12, 0x20(%2) \n" // store dst_yuv+20h - "vpor %%ymm15, %%ymm5, %%ymm3 \n" // get results - "add $0x20, %4 \n" // add to src buffer - // ptr - "vinserti128 $0x1, %%xmm3, %%ymm13, %%ymm4 \n" // insert - "vperm2i128 $0x31, %%ymm13, %%ymm3, %%ymm5 \n" // insert - "vmovdqu %%ymm4, (%2) \n" // store dst_yuv - "vmovdqu %%ymm5, 0x40(%2) \n" // store dst_yuv+40h - "add $0x60,%2 \n" // add to dst buffer - // ptr - // "cmp %3, %4 \n" //(width64 - - // 32 bytes) and src_offset - "sub $0x20,%3 \n" // 32 pixels per loop - "jg 1b \n" - "vzeroupper \n" // sse-avx2 - // transistions - - : "+r"(src_y), //%0 - "+r"(src_vu), //%1 - "+r"(dst_yuv24), //%2 - "+r"(width64), //%3 - "+r"(src_offset) //%4 - : "m"(kBLEND0), //%5 - "m"(kBLEND1), //%6 - "m"(kBLEND2), //%7 - "m"(kSHUF0), //%8 - "m"(kSHUF1), //%9 - "m"(kSHUF2), //%10 - "m"(kSHUF3), //%11 - "m"(kSHUF4), //%12 - "m"(kSHUF5) //%13 - : "memory", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm12", - "xmm13", "xmm14", "xmm15"); -} -#endif // HAS_NV21TOYUV24ROW_AVX2 - -#ifdef HAS_SWAPUVROW_SSSE3 - -// Shuffle table for reversing the bytes. -static const uvec8 kShuffleUVToVU = {1u, 0u, 3u, 2u, 5u, 4u, 7u, 6u, - 9u, 8u, 11u, 10u, 13u, 12u, 15u, 14u}; - -// Convert UV plane of NV12 to VU of NV21. -void SwapUVRow_SSSE3(const uint8_t* src_uv, uint8_t* dst_vu, int width) { - asm volatile( - - "movdqu %3,%%xmm5 \n" - - LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "lea 0x20(%0),%0 \n" - "pshufb %%xmm5,%%xmm0 \n" - "pshufb %%xmm5,%%xmm1 \n" - "movdqu %%xmm0,(%1) \n" - "movdqu %%xmm1,0x10(%1) \n" - "lea 0x20(%1),%1 \n" - "sub $0x10,%2 \n" - "jg 1b \n" - : "+r"(src_uv), // %0 - "+r"(dst_vu), // %1 - "+r"(width) // %2 - : "m"(kShuffleUVToVU) // %3 - : "memory", "cc", "xmm0", "xmm1", "xmm5"); -} -#endif // HAS_SWAPUVROW_SSSE3 - -#ifdef HAS_SWAPUVROW_AVX2 -void SwapUVRow_AVX2(const uint8_t* src_uv, uint8_t* dst_vu, int width) { - asm volatile( - - "vbroadcastf128 %3,%%ymm5 \n" - - LABELALIGN - "1: \n" - "vmovdqu (%0),%%ymm0 \n" - "vmovdqu 0x20(%0),%%ymm1 \n" - "lea 0x40(%0),%0 \n" - "vpshufb %%ymm5,%%ymm0,%%ymm0 \n" - "vpshufb %%ymm5,%%ymm1,%%ymm1 \n" - "vmovdqu %%ymm0,(%1) \n" - "vmovdqu %%ymm1,0x20(%1) \n" - "lea 0x40(%1),%1 \n" - "sub $0x20,%2 \n" - "jg 1b \n" - "vzeroupper \n" - : "+r"(src_uv), // %0 - "+r"(dst_vu), // %1 - "+r"(width) // %2 - : "m"(kShuffleUVToVU) // %3 - : "memory", "cc", "xmm0", "xmm1", "xmm5"); -} -#endif // HAS_SWAPUVROW_AVX2 - -void HalfMergeUVRow_SSSE3(const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_uv, - int width) { - asm volatile( - "pcmpeqb %%xmm4,%%xmm4 \n" - "psrlw $0xf,%%xmm4 \n" - "packuswb %%xmm4,%%xmm4 \n" - "pxor %%xmm5,%%xmm5 \n" - - LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" // load 16 U values - "movdqu (%1),%%xmm1 \n" // load 16 V values - "movdqu 0(%0,%4,1),%%xmm2 \n" // 16 from next row - "movdqu 0(%1,%5,1),%%xmm3 \n" - "lea 0x10(%0),%0 \n" - "pmaddubsw %%xmm4,%%xmm0 \n" // half size - "pmaddubsw %%xmm4,%%xmm1 \n" - "pmaddubsw %%xmm4,%%xmm2 \n" - "pmaddubsw %%xmm4,%%xmm3 \n" - "lea 0x10(%1),%1 \n" - "paddw %%xmm2,%%xmm0 \n" - "paddw %%xmm3,%%xmm1 \n" - "psrlw $0x1,%%xmm0 \n" - "psrlw $0x1,%%xmm1 \n" - "pavgw %%xmm5,%%xmm0 \n" - "pavgw %%xmm5,%%xmm1 \n" - "packuswb %%xmm0,%%xmm0 \n" - "packuswb %%xmm1,%%xmm1 \n" - "punpcklbw %%xmm1,%%xmm0 \n" - "movdqu %%xmm0,(%2) \n" // store 8 UV pixels - "lea 0x10(%2),%2 \n" - "sub $0x10,%3 \n" // 16 src pixels per loop - "jg 1b \n" - : "+r"(src_u), // %0 - "+r"(src_v), // %1 - "+r"(dst_uv), // 
%2 - "+r"(width) // %3 - : "r"((intptr_t)(src_stride_u)), // %4 - "r"((intptr_t)(src_stride_v)) // %5 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); -} - -void HalfMergeUVRow_AVX2(const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_uv, - int width) { - asm volatile( - "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n" - "vpsrlw $0xf,%%ymm4,%%ymm4 \n" - "vpackuswb %%ymm4,%%ymm4,%%ymm4 \n" - "vpxor %%ymm5,%%ymm5,%%ymm5 \n" - - LABELALIGN - "1: \n" - "vmovdqu (%0),%%ymm0 \n" // load 32 U values - "vmovdqu (%1),%%ymm1 \n" // load 32 V values - "vmovdqu 0(%0,%4,1),%%ymm2 \n" // 32 from next row - "vmovdqu 0(%1,%5,1),%%ymm3 \n" - "lea 0x20(%0),%0 \n" - "vpmaddubsw %%ymm4,%%ymm0,%%ymm0 \n" // half size - "vpmaddubsw %%ymm4,%%ymm1,%%ymm1 \n" - "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n" - "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n" - "lea 0x20(%1),%1 \n" - "vpaddw %%ymm2,%%ymm0,%%ymm0 \n" - "vpaddw %%ymm3,%%ymm1,%%ymm1 \n" - "vpsrlw $0x1,%%ymm0,%%ymm0 \n" - "vpsrlw $0x1,%%ymm1,%%ymm1 \n" - "vpavgw %%ymm5,%%ymm0,%%ymm0 \n" - "vpavgw %%ymm5,%%ymm1,%%ymm1 \n" - "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n" - "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n" - "vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n" - "vmovdqu %%ymm0,(%2) \n" // store 16 UV pixels - "lea 0x20(%2),%2 \n" - "sub $0x20,%3 \n" // 32 src pixels per loop - "jg 1b \n" - "vzeroupper \n" - : "+r"(src_u), // %0 - "+r"(src_v), // %1 - "+r"(dst_uv), // %2 - "+r"(width) // %3 - : "r"((intptr_t)(src_stride_u)), // %4 - "r"((intptr_t)(src_stride_v)) // %5 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); -} - -void ClampFloatToZero_SSE2(const float* src_x, float* dst_y, int width) { - asm volatile( - "pxor %%xmm1,%%xmm1 \n" - - LABELALIGN - "1: \n" - "movd (%0),%%xmm0 \n" // load float - "maxss %%xmm1, %%xmm0 \n" // clamp to zero - "add 4, %0 \n" - "movd %%xmm0, (%1) \n" // store float - "add 4, %1 \n" - "sub $0x4,%2 \n" // 1 float per loop - "jg 1b \n" - : "+r"(src_x), // %0 - "+r"(dst_y), // %1 - "+r"(width) // %2 - : - : "memory", "cc", "xmm0", "xmm1"); -} - -#endif // defined(__x86_64__) || defined(__i386__) - -#ifdef __cplusplus -} // extern "C" -} // namespace libyuv -#endif diff --git a/thirdparty/libyuv/source/row_mmi.cc b/thirdparty/libyuv/source/row_mmi.cc deleted file mode 100644 index 362fd1c..0000000 --- a/thirdparty/libyuv/source/row_mmi.cc +++ /dev/null @@ -1,7842 +0,0 @@ -/* - * Copyright 2011 The LibYuv Project Authors. All rights reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ -#include "libyuv/row.h" - -#include // For memcpy and memset. - -#include "libyuv/basic_types.h" - -#ifdef __cplusplus -namespace libyuv { -extern "C" { -#endif - -// This module is for Mips MMI. 
-#if !defined(LIBYUV_DISABLE_MMI) && defined(_MIPS_ARCH_LOONGSON3A) - -// clang-format off - -void RGB24ToARGBRow_MMI(const uint8_t* src_rgb24, - uint8_t* dst_argb, - int width) { - uint64_t src0, src1, dest; - const uint64_t mask = 0xff000000ULL; - - __asm__ volatile( - "1: \n\t" - "gslwlc1 %[src0], 0x03(%[src_ptr]) \n\t" - "gslwrc1 %[src0], 0x00(%[src_ptr]) \n\t" - "gslwlc1 %[src1], 0x06(%[src_ptr]) \n\t" - "gslwrc1 %[src1], 0x03(%[src_ptr]) \n\t" - - "or %[src0], %[src0], %[mask] \n\t" - "or %[src1], %[src1], %[mask] \n\t" - "punpcklwd %[dest], %[src0], %[src1] \n\t" - "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t" - "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t" - - "gslwlc1 %[src0], 0x09(%[src_ptr]) \n\t" - "gslwrc1 %[src0], 0x06(%[src_ptr]) \n\t" - "gslwlc1 %[src1], 0x0c(%[src_ptr]) \n\t" - "gslwrc1 %[src1], 0x09(%[src_ptr]) \n\t" - - "or %[src0], %[src0], %[mask] \n\t" - "or %[src1], %[src1], %[mask] \n\t" - "punpcklwd %[dest], %[src0], %[src1] \n\t" - "gssdlc1 %[dest], 0x0f(%[dst_ptr]) \n\t" - "gssdrc1 %[dest], 0x08(%[dst_ptr]) \n\t" - - "daddiu %[src_ptr], %[src_ptr], 0x0c \n\t" - "daddiu %[dst_ptr], %[dst_ptr], 0x10 \n\t" - "daddi %[width], %[width], -0x04 \n\t" - "bnez %[width], 1b \n\t" - : [src0] "=&f"(src0), [src1] "=&f"(src1), [dest] "=&f"(dest) - : [src_ptr] "r"(src_rgb24), [dst_ptr] "r"(dst_argb), [width] "r"(width), - [mask] "f"(mask) - : "memory"); -} - -void RAWToARGBRow_MMI(const uint8_t* src_raw, uint8_t* dst_argb, int width) { - uint64_t src0, src1, dest; - const uint64_t mask0 = 0x0; - const uint64_t mask1 = 0xff000000ULL; - const uint64_t mask2 = 0xc6; - - __asm__ volatile( - "1: \n\t" - "gslwlc1 %[src0], 0x03(%[src_ptr]) \n\t" - "gslwrc1 %[src0], 0x00(%[src_ptr]) \n\t" - "gslwlc1 %[src1], 0x06(%[src_ptr]) \n\t" - "gslwrc1 %[src1], 0x03(%[src_ptr]) \n\t" - - "or %[src0], %[src0], %[mask1] \n\t" - "punpcklbh %[src0], %[src0], %[mask0] \n\t" - "pshufh %[src0], %[src0], %[mask2] \n\t" - "or %[src1], %[src1], %[mask1] \n\t" - "punpcklbh %[src1], %[src1], %[mask0] \n\t" - "pshufh %[src1], %[src1], %[mask2] \n\t" - "packushb %[dest], %[src0], %[src1] \n\t" - "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t" - "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t" - - "gslwlc1 %[src0], 0x09(%[src_ptr]) \n\t" - "gslwrc1 %[src0], 0x06(%[src_ptr]) \n\t" - "gslwlc1 %[src1], 0x0c(%[src_ptr]) \n\t" - "gslwrc1 %[src1], 0x09(%[src_ptr]) \n\t" - - "or %[src0], %[src0], %[mask1] \n\t" - "punpcklbh %[src0], %[src0], %[mask0] \n\t" - "pshufh %[src0], %[src0], %[mask2] \n\t" - "or %[src1], %[src1], %[mask1] \n\t" - "punpcklbh %[src1], %[src1], %[mask0] \n\t" - "pshufh %[src1], %[src1], %[mask2] \n\t" - "packushb %[dest], %[src0], %[src1] \n\t" - "gssdlc1 %[dest], 0x0f(%[dst_ptr]) \n\t" - "gssdrc1 %[dest], 0x08(%[dst_ptr]) \n\t" - - "daddiu %[src_ptr], %[src_ptr], 0x0c \n\t" - "daddiu %[dst_ptr], %[dst_ptr], 0x10 \n\t" - "daddi %[width], %[width], -0x04 \n\t" - "bnez %[width], 1b \n\t" - : [src0] "=&f"(src0), [src1] "=&f"(src1), [dest] "=&f"(dest) - : [src_ptr] "r"(src_raw), [dst_ptr] "r"(dst_argb), [mask0] "f"(mask0), - [mask1] "f"(mask1), [mask2] "f"(mask2), [width] "r"(width) - : "memory"); -} - -void RAWToRGB24Row_MMI(const uint8_t* src_raw, uint8_t* dst_rgb24, int width) { - uint64_t src0, src1; - uint64_t ftmp[4]; - uint64_t mask0 = 0xc6; - uint64_t mask1 = 0x6c; - - __asm__ volatile( - "1: \n\t" - "gsldrc1 %[src0], 0x00(%[src_raw]) \n\t" - "gsldlc1 %[src0], 0x07(%[src_raw]) \n\t" - "gslwrc1 %[src1], 0x08(%[src_raw]) \n\t" - "gslwlc1 %[src1], 0x0b(%[src_raw]) \n\t" - - "punpcklbh %[ftmp0], %[src0], %[zero] \n\t" - 
"pshufh %[ftmp0], %[ftmp0], %[mask0] \n\t" - "punpckhbh %[ftmp1], %[src0], %[zero] \n\t" - "punpcklbh %[src1], %[src1], %[zero] \n\t" - "pextrh %[ftmp2], %[ftmp0], %[three] \n\t" - "pextrh %[ftmp3], %[ftmp1], %[one] \n\t" - "pinsrh_3 %[ftmp0], %[ftmp0], %[ftmp3] \n\t" - "pextrh %[ftmp3], %[ftmp1], %[two] \n\t" - "pinsrh_1 %[ftmp1], %[ftmp1], %[ftmp2] \n\t" - "pshufh %[src1], %[src1], %[mask1] \n\t" - "pextrh %[ftmp2], %[src1], %[zero] \n\t" - "pinsrh_2 %[ftmp1], %[ftmp1], %[ftmp2] \n\t" - "pinsrh_0 %[src1], %[src1], %[ftmp3] \n\t" - "packushb %[ftmp0], %[ftmp0], %[ftmp1] \n\t" - "packushb %[src1], %[src1], %[zero] \n\t" - - "gssdrc1 %[ftmp0], 0x00(%[dst_rgb24]) \n\t" - "gssdlc1 %[ftmp0], 0x07(%[dst_rgb24]) \n\t" - "gsswrc1 %[src1], 0x08(%[dst_rgb24]) \n\t" - "gsswlc1 %[src1], 0x0b(%[dst_rgb24]) \n\t" - - "daddiu %[src_raw], %[src_raw], 0x0c \n\t" - "daddiu %[dst_rgb24], %[dst_rgb24], 0x0c \n\t" - "daddiu %[width], %[width], -0x04 \n\t" - "bgtz %[width], 1b \n\t" - : [src0] "=&f"(src0), [src1] "=&f"(src1), [ftmp0] "=&f"(ftmp[0]), - [ftmp1] "=&f"(ftmp[1]), [ftmp2] "=&f"(ftmp[2]), [ftmp3] "=&f"(ftmp[3]) - : [src_raw] "r"(src_raw), [dst_rgb24] "r"(dst_rgb24), [width] "r"(width), - [mask0] "f"(mask0), [mask1] "f"(mask1), [zero] "f"(0x00), - [one] "f"(0x01), [two] "f"(0x02), [three] "f"(0x03) - : "memory"); -} - -void RGB565ToARGBRow_MMI(const uint8_t* src_rgb565, - uint8_t* dst_argb, - int width) { - uint64_t ftmp[5]; - uint64_t c0 = 0x001f001f001f001f; - uint64_t c1 = 0x00ff00ff00ff00ff; - uint64_t c2 = 0x0007000700070007; - __asm__ volatile( - "1: \n\t" - "gsldrc1 %[src0], 0x00(%[src_rgb565]) \n\t" - "gsldlc1 %[src0], 0x07(%[src_rgb565]) \n\t" - "psrlh %[src1], %[src0], %[eight] \n\t" - "and %[b], %[src0], %[c0] \n\t" - "and %[src0], %[src0], %[c1] \n\t" - "psrlh %[src0], %[src0], %[five] \n\t" - "and %[g], %[src1], %[c2] \n\t" - "psllh %[g], %[g], %[three] \n\t" - "or %[g], %[src0], %[g] \n\t" - "psrlh %[r], %[src1], %[three] \n\t" - "psllh %[src0], %[b], %[three] \n\t" - "psrlh %[src1], %[b], %[two] \n\t" - "or %[b], %[src0], %[src1] \n\t" - "psllh %[src0], %[g], %[two] \n\t" - "psrlh %[src1], %[g], %[four] \n\t" - "or %[g], %[src0], %[src1] \n\t" - "psllh %[src0], %[r], %[three] \n\t" - "psrlh %[src1], %[r], %[two] \n\t" - "or %[r], %[src0], %[src1] \n\t" - "packushb %[b], %[b], %[r] \n\t" - "packushb %[g], %[g], %[c1] \n\t" - "punpcklbh %[src0], %[b], %[g] \n\t" - "punpckhbh %[src1], %[b], %[g] \n\t" - "punpcklhw %[r], %[src0], %[src1] \n\t" - "gssdrc1 %[r], 0x00(%[dst_argb]) \n\t" - "gssdlc1 %[r], 0x07(%[dst_argb]) \n\t" - "punpckhhw %[r], %[src0], %[src1] \n\t" - "gssdrc1 %[r], 0x08(%[dst_argb]) \n\t" - "gssdlc1 %[r], 0x0f(%[dst_argb]) \n\t" - "daddiu %[src_rgb565], %[src_rgb565], 0x08 \n\t" - "daddiu %[dst_argb], %[dst_argb], 0x10 \n\t" - "daddiu %[width], %[width], -0x04 \n\t" - "bgtz %[width], 1b \n\t" - : [src0] "=&f"(ftmp[0]), [src1] "=&f"(ftmp[1]), [b] "=&f"(ftmp[2]), - [g] "=&f"(ftmp[3]), [r] "=&f"(ftmp[4]) - : [src_rgb565] "r"(src_rgb565), [dst_argb] "r"(dst_argb), - [width] "r"(width), [c0] "f"(c0), [c1] "f"(c1), [c2] "f"(c2), - [eight] "f"(0x08), [five] "f"(0x05), [three] "f"(0x03), [two] "f"(0x02), - [four] "f"(0x04) - : "memory"); -} - -void ARGB1555ToARGBRow_MMI(const uint8_t* src_argb1555, - uint8_t* dst_argb, - int width) { - uint64_t ftmp[6]; - uint64_t c0 = 0x001f001f001f001f; - uint64_t c1 = 0x00ff00ff00ff00ff; - uint64_t c2 = 0x0003000300030003; - uint64_t c3 = 0x007c007c007c007c; - uint64_t c4 = 0x0001000100010001; - __asm__ volatile( - "1: \n\t" - "gsldrc1 %[src0], 
0x00(%[src_argb1555]) \n\t" - "gsldlc1 %[src0], 0x07(%[src_argb1555]) \n\t" - "psrlh %[src1], %[src0], %[eight] \n\t" - "and %[b], %[src0], %[c0] \n\t" - "and %[src0], %[src0], %[c1] \n\t" - "psrlh %[src0], %[src0], %[five] \n\t" - "and %[g], %[src1], %[c2] \n\t" - "psllh %[g], %[g], %[three] \n\t" - "or %[g], %[src0], %[g] \n\t" - "and %[r], %[src1], %[c3] \n\t" - "psrlh %[r], %[r], %[two] \n\t" - "psrlh %[a], %[src1], %[seven] \n\t" - "psllh %[src0], %[b], %[three] \n\t" - "psrlh %[src1], %[b], %[two] \n\t" - "or %[b], %[src0], %[src1] \n\t" - "psllh %[src0], %[g], %[three] \n\t" - "psrlh %[src1], %[g], %[two] \n\t" - "or %[g], %[src0], %[src1] \n\t" - "psllh %[src0], %[r], %[three] \n\t" - "psrlh %[src1], %[r], %[two] \n\t" - "or %[r], %[src0], %[src1] \n\t" - "xor %[a], %[a], %[c1] \n\t" - "paddb %[a], %[a], %[c4] \n\t" - "packushb %[b], %[b], %[r] \n\t" - "packushb %[g], %[g], %[a] \n\t" - "punpcklbh %[src0], %[b], %[g] \n\t" - "punpckhbh %[src1], %[b], %[g] \n\t" - "punpcklhw %[r], %[src0], %[src1] \n\t" - "gssdrc1 %[r], 0x00(%[dst_argb]) \n\t" - "gssdlc1 %[r], 0x07(%[dst_argb]) \n\t" - "punpckhhw %[r], %[src0], %[src1] \n\t" - "gssdrc1 %[r], 0x08(%[dst_argb]) \n\t" - "gssdlc1 %[r], 0x0f(%[dst_argb]) \n\t" - "daddiu %[src_argb1555], %[src_argb1555], 0x08 \n\t" - "daddiu %[dst_argb], %[dst_argb], 0x10 \n\t" - "daddiu %[width], %[width], -0x04 \n\t" - "bgtz %[width], 1b \n\t" - : [src0] "=&f"(ftmp[0]), [src1] "=&f"(ftmp[1]), [b] "=&f"(ftmp[2]), - [g] "=&f"(ftmp[3]), [r] "=&f"(ftmp[4]), [a] "=&f"(ftmp[5]) - : [src_argb1555] "r"(src_argb1555), [dst_argb] "r"(dst_argb), - [width] "r"(width), [c0] "f"(c0), [c1] "f"(c1), [c2] "f"(c2), - [c3] "f"(c3), [c4] "f"(c4), [eight] "f"(0x08), [five] "f"(0x05), - [three] "f"(0x03), [two] "f"(0x02), [seven] "f"(0x07) - : "memory"); -} - -void ARGB4444ToARGBRow_MMI(const uint8_t* src_argb4444, - uint8_t* dst_argb, - int width) { - uint64_t ftmp[6]; - uint64_t c0 = 0x000f000f000f000f; - uint64_t c1 = 0x00ff00ff00ff00ff; - __asm__ volatile( - "1: \n\t" - "gsldrc1 %[src0], 0x00(%[src_argb4444]) \n\t" - "gsldlc1 %[src0], 0x07(%[src_argb4444]) \n\t" - "psrlh %[src1], %[src0], %[eight] \n\t" - "and %[b], %[src0], %[c0] \n\t" - "and %[src0], %[src0], %[c1] \n\t" - "psrlh %[g], %[src0], %[four] \n\t" - "and %[r], %[src1], %[c0] \n\t" - "psrlh %[a], %[src1], %[four] \n\t" - "psllh %[src0], %[b], %[four] \n\t" - "or %[b], %[src0], %[b] \n\t" - "psllh %[src0], %[g], %[four] \n\t" - "or %[g], %[src0], %[g] \n\t" - "psllh %[src0], %[r], %[four] \n\t" - "or %[r], %[src0], %[r] \n\t" - "psllh %[src0], %[a], %[four] \n\t" - "or %[a], %[src0], %[a] \n\t" - "packushb %[b], %[b], %[r] \n\t" - "packushb %[g], %[g], %[a] \n\t" - "punpcklbh %[src0], %[b], %[g] \n\t" - "punpckhbh %[src1], %[b], %[g] \n\t" - "punpcklhw %[r], %[src0], %[src1] \n\t" - "gssdrc1 %[r], 0x00(%[dst_argb]) \n\t" - "gssdlc1 %[r], 0x07(%[dst_argb]) \n\t" - "punpckhhw %[r], %[src0], %[src1] \n\t" - "gssdrc1 %[r], 0x08(%[dst_argb]) \n\t" - "gssdlc1 %[r], 0x0f(%[dst_argb]) \n\t" - "daddiu %[src_argb4444], %[src_argb4444], 0x08 \n\t" - "daddiu %[dst_argb], %[dst_argb], 0x10 \n\t" - "daddiu %[width], %[width], -0x04 \n\t" - "bgtz %[width], 1b \n\t" - : [src0] "=&f"(ftmp[0]), [src1] "=&f"(ftmp[1]), [b] "=&f"(ftmp[2]), - [g] "=&f"(ftmp[3]), [r] "=&f"(ftmp[4]), [a] "=&f"(ftmp[5]) - : [src_argb4444] "r"(src_argb4444), [dst_argb] "r"(dst_argb), - [width] "r"(width), [c0] "f"(c0), [c1] "f"(c1), [eight] "f"(0x08), - [four] "f"(0x04) - : "memory"); -} - -void ARGBToRGB24Row_MMI(const uint8_t* src_argb, uint8_t* 
dst_rgb, int width) { - uint64_t src; - - __asm__ volatile( - "1: \n\t" - "gslwlc1 %[src], 0x03(%[src_ptr]) \n\t" - "gslwrc1 %[src], 0x00(%[src_ptr]) \n\t" - "gsswlc1 %[src], 0x03(%[dst_ptr]) \n\t" - "gsswrc1 %[src], 0x00(%[dst_ptr]) \n\t" - - "gslwlc1 %[src], 0x07(%[src_ptr]) \n\t" - "gslwrc1 %[src], 0x04(%[src_ptr]) \n\t" - "gsswlc1 %[src], 0x06(%[dst_ptr]) \n\t" - "gsswrc1 %[src], 0x03(%[dst_ptr]) \n\t" - - "gslwlc1 %[src], 0x0b(%[src_ptr]) \n\t" - "gslwrc1 %[src], 0x08(%[src_ptr]) \n\t" - "gsswlc1 %[src], 0x09(%[dst_ptr]) \n\t" - "gsswrc1 %[src], 0x06(%[dst_ptr]) \n\t" - - "gslwlc1 %[src], 0x0f(%[src_ptr]) \n\t" - "gslwrc1 %[src], 0x0c(%[src_ptr]) \n\t" - "gsswlc1 %[src], 0x0c(%[dst_ptr]) \n\t" - "gsswrc1 %[src], 0x09(%[dst_ptr]) \n\t" - - "daddiu %[src_ptr], %[src_ptr], 0x10 \n\t" - "daddiu %[dst_ptr], %[dst_ptr], 0x0c \n\t" - "daddi %[width], %[width], -0x04 \n\t" - "bnez %[width], 1b \n\t" - : [src] "=&f"(src) - : [src_ptr] "r"(src_argb), [dst_ptr] "r"(dst_rgb), [width] "r"(width) - : "memory"); -} - -void ARGBToRAWRow_MMI(const uint8_t* src_argb, uint8_t* dst_rgb, int width) { - uint64_t src0, src1; - uint64_t ftmp[3]; - uint64_t mask0 = 0xc6; - uint64_t mask1 = 0x18; - - __asm__ volatile( - "1: \n\t" - "gsldrc1 %[src0], 0x00(%[src_argb]) \n\t" - "gsldlc1 %[src0], 0x07(%[src_argb]) \n\t" - "gsldrc1 %[src1], 0x08(%[src_argb]) \n\t" - "gsldlc1 %[src1], 0x0f(%[src_argb]) \n\t" - - "punpcklbh %[ftmp0], %[src0], %[zero] \n\t" - "pshufh %[ftmp0], %[ftmp0], %[mask0] \n\t" - "punpckhbh %[ftmp1], %[src0], %[zero] \n\t" - "punpcklbh %[ftmp2], %[src1], %[zero] \n\t" - "punpckhbh %[src1], %[src1], %[zero] \n\t" - - "pextrh %[src0], %[ftmp1], %[two] \n\t" - "pinsrh_3 %[ftmp0], %[ftmp0], %[src0] \n\t" - "pshufh %[ftmp1], %[ftmp1], %[one] \n\t" - - "pextrh %[src0], %[ftmp2], %[two] \n\t" - "pinsrh_2 %[ftmp1], %[ftmp1], %[src0] \n\t" - "pextrh %[src0], %[ftmp2], %[one] \n\t" - "pinsrh_3 %[ftmp1], %[ftmp1], %[src0] \n\t" - "pextrh %[src0], %[ftmp2], %[zero] \n\t" - "pshufh %[src1], %[src1], %[mask1] \n\t" - "pinsrh_0 %[src1], %[src1], %[src0] \n\t" - "packushb %[ftmp0], %[ftmp0], %[ftmp1] \n\t" - "packushb %[src1], %[src1], %[zero] \n\t" - - "gssdrc1 %[ftmp0], 0x00(%[dst_rgb]) \n\t" - "gssdlc1 %[ftmp0], 0x07(%[dst_rgb]) \n\t" - "gsswrc1 %[src1], 0x08(%[dst_rgb]) \n\t" - "gsswlc1 %[src1], 0x0b(%[dst_rgb]) \n\t" - - "daddiu %[src_argb], %[src_argb], 0x10 \n\t" - "daddiu %[dst_rgb], %[dst_rgb], 0x0c \n\t" - "daddiu %[width], %[width], -0x04 \n\t" - "bgtz %[width], 1b \n\t" - : [src0] "=&f"(src0), [src1] "=&f"(src1), [ftmp0] "=&f"(ftmp[0]), - [ftmp1] "=&f"(ftmp[1]), [ftmp2] "=&f"(ftmp[2]) - : [src_argb] "r"(src_argb), [dst_rgb] "r"(dst_rgb), [width] "r"(width), - [mask0] "f"(mask0), [mask1] "f"(mask1), [zero] "f"(0x00), - [one] "f"(0x01), [two] "f"(0x02) - : "memory"); -} - -void ARGBToRGB565Row_MMI(const uint8_t* src_argb, uint8_t* dst_rgb, int width) { - uint64_t src0, src1; - uint64_t ftmp[3]; - - __asm__ volatile( - "1: \n\t" - "gsldrc1 %[src0], 0x00(%[src_argb]) \n\t" - "gsldlc1 %[src0], 0x07(%[src_argb]) \n\t" - "gsldrc1 %[src1], 0x08(%[src_argb]) \n\t" - "gsldlc1 %[src1], 0x0f(%[src_argb]) \n\t" - - "punpcklbh %[b], %[src0], %[src1] \n\t" - "punpckhbh %[g], %[src0], %[src1] \n\t" - "punpcklbh %[src0], %[b], %[g] \n\t" - "punpckhbh %[src1], %[b], %[g] \n\t" - "punpcklbh %[b], %[src0], %[zero] \n\t" - "punpckhbh %[g], %[src0], %[zero] \n\t" - "punpcklbh %[r], %[src1], %[zero] \n\t" - - "psrlh %[b], %[b], %[three] \n\t" - "psrlh %[g], %[g], %[two] \n\t" - "psrlh %[r], %[r], %[three] \n\t" - - 
"psllh %[g], %[g], %[five] \n\t" - "psllh %[r], %[r], %[eleven] \n\t" - "or %[b], %[b], %[g] \n\t" - "or %[b], %[b], %[r] \n\t" - - "gssdrc1 %[b], 0x00(%[dst_rgb]) \n\t" - "gssdlc1 %[b], 0x07(%[dst_rgb]) \n\t" - - "daddiu %[src_argb], %[src_argb], 0x10 \n\t" - "daddiu %[dst_rgb], %[dst_rgb], 0x08 \n\t" - "daddiu %[width], %[width], -0x04 \n\t" - "bgtz %[width], 1b \n\t" - : [src0] "=&f"(src0), [src1] "=&f"(src1), [b] "=&f"(ftmp[0]), - [g] "=&f"(ftmp[1]), [r] "=&f"(ftmp[2]) - : [src_argb] "r"(src_argb), [dst_rgb] "r"(dst_rgb), [width] "r"(width), - [zero] "f"(0x00), [two] "f"(0x02), [three] "f"(0x03), [five] "f"(0x05), - [eleven] "f"(0x0b) - : "memory"); -} - -// dither4 is a row of 4 values from 4x4 dither matrix. -// The 4x4 matrix contains values to increase RGB. When converting to -// fewer bits (565) this provides an ordered dither. -// The order in the 4x4 matrix in first byte is upper left. -// The 4 values are passed as an int, then referenced as an array, so -// endian will not affect order of the original matrix. But the dither4 -// will containing the first pixel in the lower byte for little endian -// or the upper byte for big endian. -void ARGBToRGB565DitherRow_MMI(const uint8_t* src_argb, - uint8_t* dst_rgb, - const uint32_t dither4, - int width) { - uint64_t src0, src1; - uint64_t ftmp[3]; - uint64_t c0 = 0x00ff00ff00ff00ff; - - __asm__ volatile( - "punpcklbh %[dither], %[dither], %[zero] \n\t" - "1: \n\t" - "gsldrc1 %[src0], 0x00(%[src_argb]) \n\t" - "gsldlc1 %[src0], 0x07(%[src_argb]) \n\t" - "gsldrc1 %[src1], 0x08(%[src_argb]) \n\t" - "gsldlc1 %[src1], 0x0f(%[src_argb]) \n\t" - - "punpcklbh %[b], %[src0], %[src1] \n\t" - "punpckhbh %[g], %[src0], %[src1] \n\t" - "punpcklbh %[src0], %[b], %[g] \n\t" - "punpckhbh %[src1], %[b], %[g] \n\t" - "punpcklbh %[b], %[src0], %[zero] \n\t" - "punpckhbh %[g], %[src0], %[zero] \n\t" - "punpcklbh %[r], %[src1], %[zero] \n\t" - - "paddh %[b], %[b], %[dither] \n\t" - "paddh %[g], %[g], %[dither] \n\t" - "paddh %[r], %[r], %[dither] \n\t" - "pcmpgth %[src0], %[b], %[c0] \n\t" - "or %[src0], %[src0], %[b] \n\t" - "and %[b], %[src0], %[c0] \n\t" - "pcmpgth %[src0], %[g], %[c0] \n\t" - "or %[src0], %[src0], %[g] \n\t" - "and %[g], %[src0], %[c0] \n\t" - "pcmpgth %[src0], %[r], %[c0] \n\t" - "or %[src0], %[src0], %[r] \n\t" - "and %[r], %[src0], %[c0] \n\t" - - "psrlh %[b], %[b], %[three] \n\t" - "psrlh %[g], %[g], %[two] \n\t" - "psrlh %[r], %[r], %[three] \n\t" - - "psllh %[g], %[g], %[five] \n\t" - "psllh %[r], %[r], %[eleven] \n\t" - "or %[b], %[b], %[g] \n\t" - "or %[b], %[b], %[r] \n\t" - - "gssdrc1 %[b], 0x00(%[dst_rgb]) \n\t" - "gssdlc1 %[b], 0x07(%[dst_rgb]) \n\t" - - "daddiu %[src_argb], %[src_argb], 0x10 \n\t" - "daddiu %[dst_rgb], %[dst_rgb], 0x08 \n\t" - "daddiu %[width], %[width], -0x04 \n\t" - "bgtz %[width], 1b \n\t" - : [src0] "=&f"(src0), [src1] "=&f"(src1), [b] "=&f"(ftmp[0]), - [g] "=&f"(ftmp[1]), [r] "=&f"(ftmp[2]) - : [src_argb] "r"(src_argb), [dst_rgb] "r"(dst_rgb), [width] "r"(width), - [dither] "f"(dither4), [c0] "f"(c0), [zero] "f"(0x00), [two] "f"(0x02), - [three] "f"(0x03), [five] "f"(0x05), [eleven] "f"(0x0b) - : "memory"); -} - -void ARGBToARGB1555Row_MMI(const uint8_t* src_argb, - uint8_t* dst_rgb, - int width) { - uint64_t src0, src1; - uint64_t ftmp[4]; - - __asm__ volatile( - "1: \n\t" - "gsldrc1 %[src0], 0x00(%[src_argb]) \n\t" - "gsldlc1 %[src0], 0x07(%[src_argb]) \n\t" - "gsldrc1 %[src1], 0x08(%[src_argb]) \n\t" - "gsldlc1 %[src1], 0x0f(%[src_argb]) \n\t" - - "punpcklbh %[b], %[src0], %[src1] \n\t" - 
"punpckhbh %[g], %[src0], %[src1] \n\t" - "punpcklbh %[src0], %[b], %[g] \n\t" - "punpckhbh %[src1], %[b], %[g] \n\t" - "punpcklbh %[b], %[src0], %[zero] \n\t" - "punpckhbh %[g], %[src0], %[zero] \n\t" - "punpcklbh %[r], %[src1], %[zero] \n\t" - "punpckhbh %[a], %[src1], %[zero] \n\t" - - "psrlh %[b], %[b], %[three] \n\t" - "psrlh %[g], %[g], %[three] \n\t" - "psrlh %[r], %[r], %[three] \n\t" - "psrlh %[a], %[a], %[seven] \n\t" - - "psllh %[g], %[g], %[five] \n\t" - "psllh %[r], %[r], %[ten] \n\t" - "psllh %[a], %[a], %[fifteen] \n\t" - "or %[b], %[b], %[g] \n\t" - "or %[b], %[b], %[r] \n\t" - "or %[b], %[b], %[a] \n\t" - - "gssdrc1 %[b], 0x00(%[dst_rgb]) \n\t" - "gssdlc1 %[b], 0x07(%[dst_rgb]) \n\t" - - "daddiu %[src_argb], %[src_argb], 0x10 \n\t" - "daddiu %[dst_rgb], %[dst_rgb], 0x08 \n\t" - "daddiu %[width], %[width], -0x04 \n\t" - "bgtz %[width], 1b \n\t" - : [src0] "=&f"(src0), [src1] "=&f"(src1), [b] "=&f"(ftmp[0]), - [g] "=&f"(ftmp[1]), [r] "=&f"(ftmp[2]), [a] "=&f"(ftmp[3]) - : [src_argb] "r"(src_argb), [dst_rgb] "r"(dst_rgb), [width] "r"(width), - [zero] "f"(0x00), [three] "f"(0x03), [five] "f"(0x05), - [seven] "f"(0x07), [ten] "f"(0x0a), [fifteen] "f"(0x0f) - : "memory"); -} - -void ARGBToARGB4444Row_MMI(const uint8_t* src_argb, - uint8_t* dst_rgb, - int width) { - uint64_t src0, src1; - uint64_t ftmp[4]; - - __asm__ volatile( - "1: \n\t" - "gsldrc1 %[src0], 0x00(%[src_argb]) \n\t" - "gsldlc1 %[src0], 0x07(%[src_argb]) \n\t" - "gsldrc1 %[src1], 0x08(%[src_argb]) \n\t" - "gsldlc1 %[src1], 0x0f(%[src_argb]) \n\t" - - "punpcklbh %[b], %[src0], %[src1] \n\t" - "punpckhbh %[g], %[src0], %[src1] \n\t" - "punpcklbh %[src0], %[b], %[g] \n\t" - "punpckhbh %[src1], %[b], %[g] \n\t" - "punpcklbh %[b], %[src0], %[zero] \n\t" - "punpckhbh %[g], %[src0], %[zero] \n\t" - "punpcklbh %[r], %[src1], %[zero] \n\t" - "punpckhbh %[a], %[src1], %[zero] \n\t" - - "psrlh %[b], %[b], %[four] \n\t" - "psrlh %[g], %[g], %[four] \n\t" - "psrlh %[r], %[r], %[four] \n\t" - "psrlh %[a], %[a], %[four] \n\t" - - "psllh %[g], %[g], %[four] \n\t" - "psllh %[r], %[r], %[eight] \n\t" - "psllh %[a], %[a], %[twelve] \n\t" - "or %[b], %[b], %[g] \n\t" - "or %[b], %[b], %[r] \n\t" - "or %[b], %[b], %[a] \n\t" - - "gssdrc1 %[b], 0x00(%[dst_rgb]) \n\t" - "gssdlc1 %[b], 0x07(%[dst_rgb]) \n\t" - - "daddiu %[src_argb], %[src_argb], 0x10 \n\t" - "daddiu %[dst_rgb], %[dst_rgb], 0x08 \n\t" - "daddiu %[width], %[width], -0x04 \n\t" - "bgtz %[width], 1b \n\t" - : [src0] "=&f"(src0), [src1] "=&f"(src1), [b] "=&f"(ftmp[0]), - [g] "=&f"(ftmp[1]), [r] "=&f"(ftmp[2]), [a] "=&f"(ftmp[3]) - : [src_argb] "r"(src_argb), [dst_rgb] "r"(dst_rgb), [width] "r"(width), - [zero] "f"(0x00), [four] "f"(0x04), [eight] "f"(0x08), - [twelve] "f"(0x0c) - : "memory"); -} - -void ARGBToYRow_MMI(const uint8_t* src_argb, uint8_t* dst_y, int width) { - uint64_t src, src_hi, src_lo; - uint64_t dest0, dest1, dest2, dest3; - const uint64_t value = 0x1080; - const uint64_t mask = 0x0001004200810019; - - __asm__ volatile( - "1: \n\t" - "gsldlc1 %[src], 0x07(%[src_argb]) \n\t" - "gsldrc1 %[src], 0x00(%[src_argb]) \n\t" - "punpcklbh %[src_lo], %[src], %[zero] \n\t" - "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" - "punpckhbh %[src_hi], %[src], %[zero] \n\t" - "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t" - "punpcklwd %[src], %[src_lo], %[src_hi] \n\t" - "punpckhwd %[dest0], %[src_lo], %[src_hi] \n\t" - "paddw %[dest0], %[dest0], %[src] \n\t" - "psrlw %[dest0], 
%[dest0], %[eight] \n\t" - - "gsldlc1 %[src], 0x0f(%[src_argb]) \n\t" - "gsldrc1 %[src], 0x08(%[src_argb]) \n\t" - "punpcklbh %[src_lo], %[src], %[zero] \n\t" - "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" - "punpckhbh %[src_hi], %[src], %[zero] \n\t" - "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t" - "punpcklwd %[src], %[src_lo], %[src_hi] \n\t" - "punpckhwd %[dest1], %[src_lo], %[src_hi] \n\t" - "paddw %[dest1], %[dest1], %[src] \n\t" - "psrlw %[dest1], %[dest1], %[eight] \n\t" - - "gsldlc1 %[src], 0x17(%[src_argb]) \n\t" - "gsldrc1 %[src], 0x10(%[src_argb]) \n\t" - "punpcklbh %[src_lo], %[src], %[zero] \n\t" - "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" - "punpckhbh %[src_hi], %[src], %[zero] \n\t" - "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t" - "punpcklwd %[src], %[src_lo], %[src_hi] \n\t" - "punpckhwd %[dest2], %[src_lo], %[src_hi] \n\t" - "paddw %[dest2], %[dest2], %[src] \n\t" - "psrlw %[dest2], %[dest2], %[eight] \n\t" - - "gsldlc1 %[src], 0x1f(%[src_argb]) \n\t" - "gsldrc1 %[src], 0x18(%[src_argb]) \n\t" - "punpcklbh %[src_lo], %[src], %[zero] \n\t" - "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" - "punpckhbh %[src_hi], %[src], %[zero] \n\t" - "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t" - "punpcklwd %[src], %[src_lo], %[src_hi] \n\t" - "punpckhwd %[dest3], %[src_lo], %[src_hi] \n\t" - "paddw %[dest3], %[dest3], %[src] \n\t" - "psrlw %[dest3], %[dest3], %[eight] \n\t" - - "packsswh %[src_lo], %[dest0], %[dest1] \n\t" - "packsswh %[src_hi], %[dest2], %[dest3] \n\t" - "packushb %[dest0], %[src_lo], %[src_hi] \n\t" - "gssdlc1 %[dest0], 0x07(%[dst_y]) \n\t" - "gssdrc1 %[dest0], 0x00(%[dst_y]) \n\t" - - "daddiu %[src_argb], %[src_argb], 0x20 \n\t" - "daddiu %[dst_y], %[dst_y], 0x08 \n\t" - "daddi %[width], %[width], -0x08 \n\t" - "bnez %[width], 1b \n\t" - : [src] "=&f"(src), [src_hi] "=&f"(src_hi), [src_lo] "=&f"(src_lo), - [dest0] "=&f"(dest0), [dest1] "=&f"(dest1), [dest2] "=&f"(dest2), - [dest3] "=&f"(dest3) - : [src_argb] "r"(src_argb), [dst_y] "r"(dst_y), [width] "r"(width), - [mask] "f"(mask), [value] "f"(value), [eight] "f"(0x08), - [zero] "f"(0x00) - : "memory"); -} - -void ARGBToUVRow_MMI(const uint8_t* src_rgb, - int src_stride_rgb, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - uint64_t src_rgb1; - uint64_t ftmp[13]; - uint64_t tmp[1]; - const uint64_t value = 0x4040; - const uint64_t mask_u = 0x0013002500380002; - const uint64_t mask_v = 0x00020038002f0009; - - __asm__ volatile( - "dli %[tmp0], 0x0001000100010001 \n\t" - "dmtc1 %[tmp0], %[ftmp12] \n\t" - "1: \n\t" - "daddu %[src_rgb1], %[src_rgb], %[src_stride_rgb] \n\t" - "gsldrc1 %[src0], 0x00(%[src_rgb]) \n\t" - "gsldlc1 %[src0], 0x07(%[src_rgb]) \n\t" - "gsldrc1 %[src1], 0x00(%[src_rgb1]) \n\t" - "gsldlc1 %[src1], 0x07(%[src_rgb1]) \n\t" - "punpcklbh %[src_lo], %[src0], %[zero] \n\t" - "punpckhbh %[src_hi], %[src0], %[zero] \n\t" - "paddh %[src0], %[src_lo], %[src_hi] \n\t" - "punpcklbh %[src_lo], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_lo] \n\t" - "punpckhbh %[src_hi], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_hi] \n\t" - "paddh %[src0], %[src0], %[ftmp12] \n\t" - "psrlh %[src0], %[src0], %[one] \n\t" - "dsll %[dest0_u], %[src0], %[sixteen] \n\t" - "pinsrh_0 %[dest0_u], %[dest0_u], %[value] 
\n\t" - "pinsrh_3 %[dest0_v], %[src0], %[value] \n\t" - "pmaddhw %[dest0_u], %[dest0_u], %[mask_u] \n\t" - "pmaddhw %[dest0_v], %[dest0_v], %[mask_v] \n\t" - - "gsldrc1 %[src0], 0x08(%[src_rgb]) \n\t" - "gsldlc1 %[src0], 0x0f(%[src_rgb]) \n\t" - "gsldrc1 %[src1], 0x08(%[src_rgb1]) \n\t" - "gsldlc1 %[src1], 0x0f(%[src_rgb1]) \n\t" - "punpcklbh %[src_lo], %[src0], %[zero] \n\t" - "punpckhbh %[src_hi], %[src0], %[zero] \n\t" - "paddh %[src0], %[src_lo], %[src_hi] \n\t" - "punpcklbh %[src_lo], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_lo] \n\t" - "punpckhbh %[src_hi], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_hi] \n\t" - "paddh %[src0], %[src0], %[ftmp12] \n\t" - "psrlh %[src0], %[src0], %[one] \n\t" - "dsll %[src_lo], %[src0], %[sixteen] \n\t" - "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t" - "pinsrh_3 %[src_hi], %[src0], %[value] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t" - - "punpcklwd %[src0], %[dest0_u], %[src_lo] \n\t" - "punpckhwd %[src1], %[dest0_u], %[src_lo] \n\t" - "psubw %[dest0_u], %[src0], %[src1] \n\t" - "psraw %[dest0_u], %[dest0_u], %[eight] \n\t" - "punpcklwd %[src0], %[dest0_v], %[src_hi] \n\t" - "punpckhwd %[src1], %[dest0_v], %[src_hi] \n\t" - "psubw %[dest0_v], %[src1], %[src0] \n\t" - "psraw %[dest0_v], %[dest0_v], %[eight] \n\t" - - "gsldrc1 %[src0], 0x10(%[src_rgb]) \n\t" - "gsldlc1 %[src0], 0x17(%[src_rgb]) \n\t" - "gsldrc1 %[src1], 0x10(%[src_rgb1]) \n\t" - "gsldlc1 %[src1], 0x17(%[src_rgb1]) \n\t" - "punpcklbh %[src_lo], %[src0], %[zero] \n\t" - "punpckhbh %[src_hi], %[src0], %[zero] \n\t" - "paddh %[src0], %[src_lo], %[src_hi] \n\t" - "punpcklbh %[src_lo], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_lo] \n\t" - "punpckhbh %[src_hi], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_hi] \n\t" - "paddh %[src0], %[src0], %[ftmp12] \n\t" - "psrlh %[src0], %[src0], %[one] \n\t" - "dsll %[dest1_u], %[src0], %[sixteen] \n\t" - "pinsrh_0 %[dest1_u], %[dest1_u], %[value] \n\t" - "pinsrh_3 %[dest1_v], %[src0], %[value] \n\t" - "pmaddhw %[dest1_u], %[dest1_u], %[mask_u] \n\t" - "pmaddhw %[dest1_v], %[dest1_v], %[mask_v] \n\t" - - "gsldrc1 %[src0], 0x18(%[src_rgb]) \n\t" - "gsldlc1 %[src0], 0x1f(%[src_rgb]) \n\t" - "gsldrc1 %[src1], 0x18(%[src_rgb1]) \n\t" - "gsldlc1 %[src1], 0x1f(%[src_rgb1]) \n\t" - "punpcklbh %[src_lo], %[src0], %[zero] \n\t" - "punpckhbh %[src_hi], %[src0], %[zero] \n\t" - "paddh %[src0], %[src_lo], %[src_hi] \n\t" - "punpcklbh %[src_lo], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_lo] \n\t" - "punpckhbh %[src_hi], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_hi] \n\t" - "paddh %[src0], %[src0], %[ftmp12] \n\t" - "psrlh %[src0], %[src0], %[one] \n\t" - "dsll %[src_lo], %[src0], %[sixteen] \n\t" - "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t" - "pinsrh_3 %[src_hi], %[src0], %[value] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t" - - "punpcklwd %[src0], %[dest1_u], %[src_lo] \n\t" - "punpckhwd %[src1], %[dest1_u], %[src_lo] \n\t" - "psubw %[dest1_u], %[src0], %[src1] \n\t" - "psraw %[dest1_u], %[dest1_u], %[eight] \n\t" - "punpcklwd %[src0], %[dest1_v], %[src_hi] \n\t" - "punpckhwd %[src1], %[dest1_v], %[src_hi] \n\t" - "psubw %[dest1_v], %[src1], %[src0] \n\t" - "psraw %[dest1_v], %[dest1_v], %[eight] \n\t" - - "gsldrc1 %[src0], 0x20(%[src_rgb]) \n\t" - "gsldlc1 %[src0], 0x27(%[src_rgb]) \n\t" - "gsldrc1 %[src1], 0x20(%[src_rgb1]) \n\t" - "gsldlc1 %[src1], 
0x27(%[src_rgb1]) \n\t" - "punpcklbh %[src_lo], %[src0], %[zero] \n\t" - "punpckhbh %[src_hi], %[src0], %[zero] \n\t" - "paddh %[src0], %[src_lo], %[src_hi] \n\t" - "punpcklbh %[src_lo], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_lo] \n\t" - "punpckhbh %[src_hi], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_hi] \n\t" - "paddh %[src0], %[src0], %[ftmp12] \n\t" - "psrlh %[src0], %[src0], %[one] \n\t" - "dsll %[dest2_u], %[src0], %[sixteen] \n\t" - "pinsrh_0 %[dest2_u], %[dest2_u], %[value] \n\t" - "pinsrh_3 %[dest2_v], %[src0], %[value] \n\t" - "pmaddhw %[dest2_u], %[dest2_u], %[mask_u] \n\t" - "pmaddhw %[dest2_v], %[dest2_v], %[mask_v] \n\t" - - "gsldrc1 %[src0], 0x28(%[src_rgb]) \n\t" - "gsldlc1 %[src0], 0x2f(%[src_rgb]) \n\t" - "gsldrc1 %[src1], 0x28(%[src_rgb1]) \n\t" - "gsldlc1 %[src1], 0x2f(%[src_rgb1]) \n\t" - "punpcklbh %[src_lo], %[src0], %[zero] \n\t" - "punpckhbh %[src_hi], %[src0], %[zero] \n\t" - "paddh %[src0], %[src_lo], %[src_hi] \n\t" - "punpcklbh %[src_lo], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_lo] \n\t" - "punpckhbh %[src_hi], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_hi] \n\t" - "paddh %[src0], %[src0], %[ftmp12] \n\t" - "psrlh %[src0], %[src0], %[one] \n\t" - "dsll %[src_lo], %[src0], %[sixteen] \n\t" - "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t" - "pinsrh_3 %[src_hi], %[src0], %[value] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t" - - "punpcklwd %[src0], %[dest2_u], %[src_lo] \n\t" - "punpckhwd %[src1], %[dest2_u], %[src_lo] \n\t" - "psubw %[dest2_u], %[src0], %[src1] \n\t" - "psraw %[dest2_u], %[dest2_u], %[eight] \n\t" - "punpcklwd %[src0], %[dest2_v], %[src_hi] \n\t" - "punpckhwd %[src1], %[dest2_v], %[src_hi] \n\t" - "psubw %[dest2_v], %[src1], %[src0] \n\t" - "psraw %[dest2_v], %[dest2_v], %[eight] \n\t" - - "gsldrc1 %[src0], 0x30(%[src_rgb]) \n\t" - "gsldlc1 %[src0], 0x37(%[src_rgb]) \n\t" - "gsldrc1 %[src1], 0x30(%[src_rgb1]) \n\t" - "gsldlc1 %[src1], 0x37(%[src_rgb1]) \n\t" - "punpcklbh %[src_lo], %[src0], %[zero] \n\t" - "punpckhbh %[src_hi], %[src0], %[zero] \n\t" - "paddh %[src0], %[src_lo], %[src_hi] \n\t" - "punpcklbh %[src_lo], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_lo] \n\t" - "punpckhbh %[src_hi], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_hi] \n\t" - "paddh %[src0], %[src0], %[ftmp12] \n\t" - "psrlh %[src0], %[src0], %[one] \n\t" - "dsll %[dest3_u], %[src0], %[sixteen] \n\t" - "pinsrh_0 %[dest3_u], %[dest3_u], %[value] \n\t" - "pinsrh_3 %[dest3_v], %[src0], %[value] \n\t" - "pmaddhw %[dest3_u], %[dest3_u], %[mask_u] \n\t" - "pmaddhw %[dest3_v], %[dest3_v], %[mask_v] \n\t" - - "gsldrc1 %[src0], 0x38(%[src_rgb]) \n\t" - "gsldlc1 %[src0], 0x3f(%[src_rgb]) \n\t" - "gsldrc1 %[src1], 0x38(%[src_rgb1]) \n\t" - "gsldlc1 %[src1], 0x3f(%[src_rgb1]) \n\t" - "punpcklbh %[src_lo], %[src0], %[zero] \n\t" - "punpckhbh %[src_hi], %[src0], %[zero] \n\t" - "paddh %[src0], %[src_lo], %[src_hi] \n\t" - "punpcklbh %[src_lo], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_lo] \n\t" - "punpckhbh %[src_hi], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_hi] \n\t" - "paddh %[src0], %[src0], %[ftmp12] \n\t" - "psrlh %[src0], %[src0], %[one] \n\t" - "dsll %[src_lo], %[src0], %[sixteen] \n\t" - "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t" - "pinsrh_3 %[src_hi], %[src0], %[value] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t" - - "punpcklwd %[src0], %[dest3_u], 
%[src_lo] \n\t" - "punpckhwd %[src1], %[dest3_u], %[src_lo] \n\t" - "psubw %[dest3_u], %[src0], %[src1] \n\t" - "psraw %[dest3_u], %[dest3_u], %[eight] \n\t" - "punpcklwd %[src0], %[dest3_v], %[src_hi] \n\t" - "punpckhwd %[src1], %[dest3_v], %[src_hi] \n\t" - "psubw %[dest3_v], %[src1], %[src0] \n\t" - "psraw %[dest3_v], %[dest3_v], %[eight] \n\t" - - "packsswh %[src0], %[dest0_u], %[dest1_u] \n\t" - "packsswh %[src1], %[dest2_u], %[dest3_u] \n\t" - "packushb %[dest0_u], %[src0], %[src1] \n\t" - "gssdlc1 %[dest0_u], 0x07(%[dst_u]) \n\t" - "gssdrc1 %[dest0_u], 0x00(%[dst_u]) \n\t" - - "packsswh %[src0], %[dest0_v], %[dest1_v] \n\t" - "packsswh %[src1], %[dest2_v], %[dest3_v] \n\t" - "packushb %[dest0_v], %[src0], %[src1] \n\t" - "gssdlc1 %[dest0_v], 0x07(%[dst_v]) \n\t" - "gssdrc1 %[dest0_v], 0x00(%[dst_v]) \n\t" - - "daddiu %[src_rgb], %[src_rgb], 0x40 \n\t" - "daddiu %[dst_u], %[dst_u], 0x08 \n\t" - "daddiu %[dst_v], %[dst_v], 0x08 \n\t" - "daddi %[width], %[width], -0x10 \n\t" - "bgtz %[width], 1b \n\t" - : [src_rgb1] "=&r"(src_rgb1), [src0] "=&f"(ftmp[0]), - [src1] "=&f"(ftmp[1]), [src_lo] "=&f"(ftmp[2]), [src_hi] "=&f"(ftmp[3]), - [dest0_u] "=&f"(ftmp[4]), [dest0_v] "=&f"(ftmp[5]), - [dest1_u] "=&f"(ftmp[6]), [dest1_v] "=&f"(ftmp[7]), - [dest2_u] "=&f"(ftmp[8]), [dest2_v] "=&f"(ftmp[9]), - [dest3_u] "=&f"(ftmp[10]), [dest3_v] "=&f"(ftmp[11]), - [ftmp12] "=&f"(ftmp[12]), [tmp0] "=&r"(tmp[0]) - : [src_rgb] "r"(src_rgb), [src_stride_rgb] "r"(src_stride_rgb), - [dst_u] "r"(dst_u), [dst_v] "r"(dst_v), [width] "r"(width), - [mask_u] "f"(mask_u), [mask_v] "f"(mask_v), [value] "f"(value), - [zero] "f"(0x00), [eight] "f"(0x08), [one] "f"(0x01), - [sixteen] "f"(0x10) - : "memory"); -} - -void BGRAToYRow_MMI(const uint8_t* src_argb, uint8_t* dst_y, int width) { - uint64_t src, src_hi, src_lo; - uint64_t dest0, dest1, dest2, dest3; - const uint64_t value = 0x1080; - const uint64_t mask = 0x0019008100420001; - - __asm__ volatile( - "1: \n\t" - "gsldlc1 %[src], 0x07(%[src_argb]) \n\t" - "gsldrc1 %[src], 0x00(%[src_argb]) \n\t" - "punpcklbh %[src_lo], %[src], %[zero] \n\t" - "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" - "punpckhbh %[src_hi], %[src], %[zero] \n\t" - "pinsrh_0 %[src_hi], %[src_hi], %[value] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t" - "punpcklwd %[src], %[src_lo], %[src_hi] \n\t" - "punpckhwd %[dest0], %[src_lo], %[src_hi] \n\t" - "paddw %[dest0], %[dest0], %[src] \n\t" - "psrlw %[dest0], %[dest0], %[eight] \n\t" - - "gsldlc1 %[src], 0x0f(%[src_argb]) \n\t" - "gsldrc1 %[src], 0x08(%[src_argb]) \n\t" - "punpcklbh %[src_lo], %[src], %[zero] \n\t" - "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" - "punpckhbh %[src_hi], %[src], %[zero] \n\t" - "pinsrh_0 %[src_hi], %[src_hi], %[value] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t" - "punpcklwd %[src], %[src_lo], %[src_hi] \n\t" - "punpckhwd %[dest1], %[src_lo], %[src_hi] \n\t" - "paddw %[dest1], %[dest1], %[src] \n\t" - "psrlw %[dest1], %[dest1], %[eight] \n\t" - - "gsldlc1 %[src], 0x17(%[src_argb]) \n\t" - "gsldrc1 %[src], 0x10(%[src_argb]) \n\t" - "punpcklbh %[src_lo], %[src], %[zero] \n\t" - "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" - "punpckhbh %[src_hi], %[src], %[zero] \n\t" - "pinsrh_0 %[src_hi], %[src_hi], %[value] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t" - "punpcklwd %[src], %[src_lo], %[src_hi] \n\t" - "punpckhwd %[dest2], %[src_lo], %[src_hi] \n\t" - 
"paddw %[dest2], %[dest2], %[src] \n\t" - "psrlw %[dest2], %[dest2], %[eight] \n\t" - - "gsldlc1 %[src], 0x1f(%[src_argb]) \n\t" - "gsldrc1 %[src], 0x18(%[src_argb]) \n\t" - "punpcklbh %[src_lo], %[src], %[zero] \n\t" - "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" - "punpckhbh %[src_hi], %[src], %[zero] \n\t" - "pinsrh_0 %[src_hi], %[src_hi], %[value] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t" - "punpcklwd %[src], %[src_lo], %[src_hi] \n\t" - "punpckhwd %[dest3], %[src_lo], %[src_hi] \n\t" - "paddw %[dest3], %[dest3], %[src] \n\t" - "psrlw %[dest3], %[dest3], %[eight] \n\t" - - "packsswh %[src_lo], %[dest0], %[dest1] \n\t" - "packsswh %[src_hi], %[dest2], %[dest3] \n\t" - "packushb %[dest0], %[src_lo], %[src_hi] \n\t" - "gssdlc1 %[dest0], 0x07(%[dst_y]) \n\t" - "gssdrc1 %[dest0], 0x00(%[dst_y]) \n\t" - - "daddiu %[src_argb], %[src_argb], 0x20 \n\t" - "daddiu %[dst_y], %[dst_y], 0x08 \n\t" - "daddi %[width], %[width], -0x08 \n\t" - "bnez %[width], 1b \n\t" - : [src] "=&f"(src), [src_hi] "=&f"(src_hi), [src_lo] "=&f"(src_lo), - [dest0] "=&f"(dest0), [dest1] "=&f"(dest1), [dest2] "=&f"(dest2), - [dest3] "=&f"(dest3) - : [src_argb] "r"(src_argb), [dst_y] "r"(dst_y), [width] "r"(width), - [mask] "f"(mask), [value] "f"(value), [eight] "f"(0x08), - [zero] "f"(0x00) - : "memory"); -} - -void BGRAToUVRow_MMI(const uint8_t* src_rgb, - int src_stride_rgb, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - uint64_t src_rgb1; - uint64_t ftmp[13]; - uint64_t tmp[1]; - const uint64_t value = 0x4040; - const uint64_t mask_u = 0x0002003800250013; - const uint64_t mask_v = 0x0009002f00380002; - - __asm__ volatile( - "dli %[tmp0], 0x0001000100010001 \n\t" - "dmtc1 %[tmp0], %[ftmp12] \n\t" - "1: \n\t" - "daddu %[src_rgb1], %[src_rgb], %[src_stride_rgb] \n\t" - "gsldrc1 %[src0], 0x00(%[src_rgb]) \n\t" - "gsldlc1 %[src0], 0x07(%[src_rgb]) \n\t" - "gsldrc1 %[src1], 0x00(%[src_rgb1]) \n\t" - "gsldlc1 %[src1], 0x07(%[src_rgb1]) \n\t" - "punpcklbh %[src_lo], %[src0], %[zero] \n\t" - "punpckhbh %[src_hi], %[src0], %[zero] \n\t" - "paddh %[src0], %[src_lo], %[src_hi] \n\t" - "punpcklbh %[src_lo], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_lo] \n\t" - "punpckhbh %[src_hi], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_hi] \n\t" - "paddh %[src0], %[src0], %[ftmp12] \n\t" - "psrlh %[src0], %[src0], %[one] \n\t" - "dsrl %[dest0_u], %[src0], %[sixteen] \n\t" - "pinsrh_3 %[dest0_u], %[dest0_u], %[value] \n\t" - "pinsrh_0 %[dest0_v], %[src0], %[value] \n\t" - "pmaddhw %[dest0_u], %[dest0_u], %[mask_u] \n\t" - "pmaddhw %[dest0_v], %[dest0_v], %[mask_v] \n\t" - - "gsldrc1 %[src0], 0x08(%[src_rgb]) \n\t" - "gsldlc1 %[src0], 0x0f(%[src_rgb]) \n\t" - "gsldrc1 %[src1], 0x08(%[src_rgb1]) \n\t" - "gsldlc1 %[src1], 0x0f(%[src_rgb1]) \n\t" - "punpcklbh %[src_lo], %[src0], %[zero] \n\t" - "punpckhbh %[src_hi], %[src0], %[zero] \n\t" - "paddh %[src0], %[src_lo], %[src_hi] \n\t" - "punpcklbh %[src_lo], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_lo] \n\t" - "punpckhbh %[src_hi], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_hi] \n\t" - "paddh %[src0], %[src0], %[ftmp12] \n\t" - "psrlh %[src0], %[src0], %[one] \n\t" - "dsrl %[src_lo], %[src0], %[sixteen] \n\t" - "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t" - "pinsrh_0 %[src_hi], %[src0], %[value] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t" - - "punpcklwd %[src0], %[dest0_u], %[src_lo] \n\t" - "punpckhwd %[src1], 
%[dest0_u], %[src_lo] \n\t" - "psubw %[dest0_u], %[src1], %[src0] \n\t" - "psraw %[dest0_u], %[dest0_u], %[eight] \n\t" - "punpcklwd %[src0], %[dest0_v], %[src_hi] \n\t" - "punpckhwd %[src1], %[dest0_v], %[src_hi] \n\t" - "psubw %[dest0_v], %[src0], %[src1] \n\t" - "psraw %[dest0_v], %[dest0_v], %[eight] \n\t" - - "gsldrc1 %[src0], 0x10(%[src_rgb]) \n\t" - "gsldlc1 %[src0], 0x17(%[src_rgb]) \n\t" - "gsldrc1 %[src1], 0x10(%[src_rgb1]) \n\t" - "gsldlc1 %[src1], 0x17(%[src_rgb1]) \n\t" - "punpcklbh %[src_lo], %[src0], %[zero] \n\t" - "punpckhbh %[src_hi], %[src0], %[zero] \n\t" - "paddh %[src0], %[src_lo], %[src_hi] \n\t" - "punpcklbh %[src_lo], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_lo] \n\t" - "punpckhbh %[src_hi], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_hi] \n\t" - "paddh %[src0], %[src0], %[ftmp12] \n\t" - "psrlh %[src0], %[src0], %[one] \n\t" - "dsrl %[dest1_u], %[src0], %[sixteen] \n\t" - "pinsrh_3 %[dest1_u], %[dest1_u], %[value] \n\t" - "pinsrh_0 %[dest1_v], %[src0], %[value] \n\t" - "pmaddhw %[dest1_u], %[dest1_u], %[mask_u] \n\t" - "pmaddhw %[dest1_v], %[dest1_v], %[mask_v] \n\t" - - "gsldrc1 %[src0], 0x18(%[src_rgb]) \n\t" - "gsldlc1 %[src0], 0x1f(%[src_rgb]) \n\t" - "gsldrc1 %[src1], 0x18(%[src_rgb1]) \n\t" - "gsldlc1 %[src1], 0x1f(%[src_rgb1]) \n\t" - "punpcklbh %[src_lo], %[src0], %[zero] \n\t" - "punpckhbh %[src_hi], %[src0], %[zero] \n\t" - "paddh %[src0], %[src_lo], %[src_hi] \n\t" - "punpcklbh %[src_lo], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_lo] \n\t" - "punpckhbh %[src_hi], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_hi] \n\t" - "paddh %[src0], %[src0], %[ftmp12] \n\t" - "psrlh %[src0], %[src0], %[one] \n\t" - "dsrl %[src_lo], %[src0], %[sixteen] \n\t" - "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t" - "pinsrh_0 %[src_hi], %[src0], %[value] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t" - - "punpcklwd %[src0], %[dest1_u], %[src_lo] \n\t" - "punpckhwd %[src1], %[dest1_u], %[src_lo] \n\t" - "psubw %[dest1_u], %[src1], %[src0] \n\t" - "psraw %[dest1_u], %[dest1_u], %[eight] \n\t" - "punpcklwd %[src0], %[dest1_v], %[src_hi] \n\t" - "punpckhwd %[src1], %[dest1_v], %[src_hi] \n\t" - "psubw %[dest1_v], %[src0], %[src1] \n\t" - "psraw %[dest1_v], %[dest1_v], %[eight] \n\t" - - "gsldrc1 %[src0], 0x20(%[src_rgb]) \n\t" - "gsldlc1 %[src0], 0x27(%[src_rgb]) \n\t" - "gsldrc1 %[src1], 0x20(%[src_rgb1]) \n\t" - "gsldlc1 %[src1], 0x27(%[src_rgb1]) \n\t" - "punpcklbh %[src_lo], %[src0], %[zero] \n\t" - "punpckhbh %[src_hi], %[src0], %[zero] \n\t" - "paddh %[src0], %[src_lo], %[src_hi] \n\t" - "punpcklbh %[src_lo], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_lo] \n\t" - "punpckhbh %[src_hi], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_hi] \n\t" - "paddh %[src0], %[src0], %[ftmp12] \n\t" - "psrlh %[src0], %[src0], %[one] \n\t" - "dsrl %[dest2_u], %[src0], %[sixteen] \n\t" - "pinsrh_3 %[dest2_u], %[dest2_u], %[value] \n\t" - "pinsrh_0 %[dest2_v], %[src0], %[value] \n\t" - "pmaddhw %[dest2_u], %[dest2_u], %[mask_u] \n\t" - "pmaddhw %[dest2_v], %[dest2_v], %[mask_v] \n\t" - - "gsldrc1 %[src0], 0x28(%[src_rgb]) \n\t" - "gsldlc1 %[src0], 0x2f(%[src_rgb]) \n\t" - "gsldrc1 %[src1], 0x28(%[src_rgb1]) \n\t" - "gsldlc1 %[src1], 0x2f(%[src_rgb1]) \n\t" - "punpcklbh %[src_lo], %[src0], %[zero] \n\t" - "punpckhbh %[src_hi], %[src0], %[zero] \n\t" - "paddh %[src0], %[src_lo], %[src_hi] \n\t" - "punpcklbh %[src_lo], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], 
%[src_lo] \n\t" - "punpckhbh %[src_hi], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_hi] \n\t" - "paddh %[src0], %[src0], %[ftmp12] \n\t" - "psrlh %[src0], %[src0], %[one] \n\t" - "dsrl %[src_lo], %[src0], %[sixteen] \n\t" - "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t" - "pinsrh_0 %[src_hi], %[src0], %[value] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t" - - "punpcklwd %[src0], %[dest2_u], %[src_lo] \n\t" - "punpckhwd %[src1], %[dest2_u], %[src_lo] \n\t" - "psubw %[dest2_u], %[src1], %[src0] \n\t" - "psraw %[dest2_u], %[dest2_u], %[eight] \n\t" - "punpcklwd %[src0], %[dest2_v], %[src_hi] \n\t" - "punpckhwd %[src1], %[dest2_v], %[src_hi] \n\t" - "psubw %[dest2_v], %[src0], %[src1] \n\t" - "psraw %[dest2_v], %[dest2_v], %[eight] \n\t" - - "gsldrc1 %[src0], 0x30(%[src_rgb]) \n\t" - "gsldlc1 %[src0], 0x37(%[src_rgb]) \n\t" - "gsldrc1 %[src1], 0x30(%[src_rgb1]) \n\t" - "gsldlc1 %[src1], 0x37(%[src_rgb1]) \n\t" - "punpcklbh %[src_lo], %[src0], %[zero] \n\t" - "punpckhbh %[src_hi], %[src0], %[zero] \n\t" - "paddh %[src0], %[src_lo], %[src_hi] \n\t" - "punpcklbh %[src_lo], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_lo] \n\t" - "punpckhbh %[src_hi], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_hi] \n\t" - "paddh %[src0], %[src0], %[ftmp12] \n\t" - "psrlh %[src0], %[src0], %[one] \n\t" - "dsrl %[dest3_u], %[src0], %[sixteen] \n\t" - "pinsrh_3 %[dest3_u], %[dest3_u], %[value] \n\t" - "pinsrh_0 %[dest3_v], %[src0], %[value] \n\t" - "pmaddhw %[dest3_u], %[dest3_u], %[mask_u] \n\t" - "pmaddhw %[dest3_v], %[dest3_v], %[mask_v] \n\t" - - "gsldrc1 %[src0], 0x38(%[src_rgb]) \n\t" - "gsldlc1 %[src0], 0x3f(%[src_rgb]) \n\t" - "gsldrc1 %[src1], 0x38(%[src_rgb1]) \n\t" - "gsldlc1 %[src1], 0x3f(%[src_rgb1]) \n\t" - "punpcklbh %[src_lo], %[src0], %[zero] \n\t" - "punpckhbh %[src_hi], %[src0], %[zero] \n\t" - "paddh %[src0], %[src_lo], %[src_hi] \n\t" - "punpcklbh %[src_lo], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_lo] \n\t" - "punpckhbh %[src_hi], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_hi] \n\t" - "paddh %[src0], %[src0], %[ftmp12] \n\t" - "psrlh %[src0], %[src0], %[one] \n\t" - "dsrl %[src_lo], %[src0], %[sixteen] \n\t" - "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t" - "pinsrh_0 %[src_hi], %[src0], %[value] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t" - - "punpcklwd %[src0], %[dest3_u], %[src_lo] \n\t" - "punpckhwd %[src1], %[dest3_u], %[src_lo] \n\t" - "psubw %[dest3_u], %[src1], %[src0] \n\t" - "psraw %[dest3_u], %[dest3_u], %[eight] \n\t" - "punpcklwd %[src0], %[dest3_v], %[src_hi] \n\t" - "punpckhwd %[src1], %[dest3_v], %[src_hi] \n\t" - "psubw %[dest3_v], %[src0], %[src1] \n\t" - "psraw %[dest3_v], %[dest3_v], %[eight] \n\t" - - "packsswh %[src0], %[dest0_u], %[dest1_u] \n\t" - "packsswh %[src1], %[dest2_u], %[dest3_u] \n\t" - "packushb %[dest0_u], %[src0], %[src1] \n\t" - "gssdlc1 %[dest0_u], 0x07(%[dst_u]) \n\t" - "gssdrc1 %[dest0_u], 0x00(%[dst_u]) \n\t" - - "packsswh %[src0], %[dest0_v], %[dest1_v] \n\t" - "packsswh %[src1], %[dest2_v], %[dest3_v] \n\t" - "packushb %[dest0_v], %[src0], %[src1] \n\t" - "gssdlc1 %[dest0_v], 0x07(%[dst_v]) \n\t" - "gssdrc1 %[dest0_v], 0x00(%[dst_v]) \n\t" - - "daddiu %[src_rgb], %[src_rgb], 0x40 \n\t" - "daddiu %[dst_u], %[dst_u], 0x08 \n\t" - "daddiu %[dst_v], %[dst_v], 0x08 \n\t" - "daddi %[width], %[width], -0x10 \n\t" - "bgtz %[width], 1b \n\t" - : [src_rgb1] "=&r"(src_rgb1), [src0] 
"=&f"(ftmp[0]), - [src1] "=&f"(ftmp[1]), [src_lo] "=&f"(ftmp[2]), [src_hi] "=&f"(ftmp[3]), - [dest0_u] "=&f"(ftmp[4]), [dest0_v] "=&f"(ftmp[5]), - [dest1_u] "=&f"(ftmp[6]), [dest1_v] "=&f"(ftmp[7]), - [dest2_u] "=&f"(ftmp[8]), [dest2_v] "=&f"(ftmp[9]), - [dest3_u] "=&f"(ftmp[10]), [dest3_v] "=&f"(ftmp[11]), - [ftmp12] "=&f"(ftmp[12]), [tmp0] "=&r"(tmp[0]) - : [src_rgb] "r"(src_rgb), [src_stride_rgb] "r"(src_stride_rgb), - [dst_u] "r"(dst_u), [dst_v] "r"(dst_v), [width] "r"(width), - [mask_u] "f"(mask_u), [mask_v] "f"(mask_v), [value] "f"(value), - [zero] "f"(0x00), [eight] "f"(0x08), [one] "f"(0x01), - [sixteen] "f"(0x10) - : "memory"); -} - -void ABGRToYRow_MMI(const uint8_t* src_argb, uint8_t* dst_y, int width) { - uint64_t src, src_hi, src_lo; - uint64_t dest0, dest1, dest2, dest3; - const uint64_t value = 0x1080; - const uint64_t mask = 0x0001001900810042; - - __asm__ volatile( - "1: \n\t" - "gsldlc1 %[src], 0x07(%[src_argb]) \n\t" - "gsldrc1 %[src], 0x00(%[src_argb]) \n\t" - "punpcklbh %[src_lo], %[src], %[zero] \n\t" - "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" - "punpckhbh %[src_hi], %[src], %[zero] \n\t" - "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t" - "punpcklwd %[src], %[src_lo], %[src_hi] \n\t" - "punpckhwd %[dest0], %[src_lo], %[src_hi] \n\t" - "paddw %[dest0], %[dest0], %[src] \n\t" - "psrlw %[dest0], %[dest0], %[eight] \n\t" - - "gsldlc1 %[src], 0x0f(%[src_argb]) \n\t" - "gsldrc1 %[src], 0x08(%[src_argb]) \n\t" - "punpcklbh %[src_lo], %[src], %[zero] \n\t" - "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" - "punpckhbh %[src_hi], %[src], %[zero] \n\t" - "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t" - "punpcklwd %[src], %[src_lo], %[src_hi] \n\t" - "punpckhwd %[dest1], %[src_lo], %[src_hi] \n\t" - "paddw %[dest1], %[dest1], %[src] \n\t" - "psrlw %[dest1], %[dest1], %[eight] \n\t" - - "gsldlc1 %[src], 0x17(%[src_argb]) \n\t" - "gsldrc1 %[src], 0x10(%[src_argb]) \n\t" - "punpcklbh %[src_lo], %[src], %[zero] \n\t" - "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" - "punpckhbh %[src_hi], %[src], %[zero] \n\t" - "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t" - "punpcklwd %[src], %[src_lo], %[src_hi] \n\t" - "punpckhwd %[dest2], %[src_lo], %[src_hi] \n\t" - "paddw %[dest2], %[dest2], %[src] \n\t" - "psrlw %[dest2], %[dest2], %[eight] \n\t" - - "gsldlc1 %[src], 0x1f(%[src_argb]) \n\t" - "gsldrc1 %[src], 0x18(%[src_argb]) \n\t" - "punpcklbh %[src_lo], %[src], %[zero] \n\t" - "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" - "punpckhbh %[src_hi], %[src], %[zero] \n\t" - "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t" - "punpcklwd %[src], %[src_lo], %[src_hi] \n\t" - "punpckhwd %[dest3], %[src_lo], %[src_hi] \n\t" - "paddw %[dest3], %[dest3], %[src] \n\t" - "psrlw %[dest3], %[dest3], %[eight] \n\t" - - "packsswh %[src_lo], %[dest0], %[dest1] \n\t" - "packsswh %[src_hi], %[dest2], %[dest3] \n\t" - "packushb %[dest0], %[src_lo], %[src_hi] \n\t" - "gssdlc1 %[dest0], 0x07(%[dst_y]) \n\t" - "gssdrc1 %[dest0], 0x00(%[dst_y]) \n\t" - - "daddiu %[src_argb], %[src_argb], 0x20 \n\t" - "daddiu %[dst_y], %[dst_y], 0x08 \n\t" - "daddi %[width], %[width], -0x08 \n\t" - "bnez %[width], 1b \n\t" - : [src] "=&f"(src), [src_hi] 
"=&f"(src_hi), [src_lo] "=&f"(src_lo), - [dest0] "=&f"(dest0), [dest1] "=&f"(dest1), [dest2] "=&f"(dest2), - [dest3] "=&f"(dest3) - : [src_argb] "r"(src_argb), [dst_y] "r"(dst_y), [width] "r"(width), - [mask] "f"(mask), [value] "f"(value), [eight] "f"(0x08), - [zero] "f"(0x00) - : "memory"); -} - -void ABGRToUVRow_MMI(const uint8_t* src_rgb, - int src_stride_rgb, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - uint64_t src_rgb1; - uint64_t ftmp[13]; - uint64_t tmp[1]; - const uint64_t value = 0x4040; - const uint64_t mask_u = 0x0002003800250013; - const uint64_t mask_v = 0x0009002F00380002; - - __asm__ volatile( - "dli %[tmp0], 0x0001000100010001 \n\t" - "dmtc1 %[tmp0], %[ftmp12] \n\t" - "1: \n\t" - "daddu %[src_rgb1], %[src_rgb], %[src_stride_rgb] \n\t" - "gsldrc1 %[src0], 0x00(%[src_rgb]) \n\t" - "gsldlc1 %[src0], 0x07(%[src_rgb]) \n\t" - "gsldrc1 %[src1], 0x00(%[src_rgb1]) \n\t" - "gsldlc1 %[src1], 0x07(%[src_rgb1]) \n\t" - "punpcklbh %[src_lo], %[src0], %[zero] \n\t" - "punpckhbh %[src_hi], %[src0], %[zero] \n\t" - "paddh %[src0], %[src_lo], %[src_hi] \n\t" - "punpcklbh %[src_lo], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_lo] \n\t" - "punpckhbh %[src_hi], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_hi] \n\t" - "paddh %[src0], %[src0], %[ftmp12] \n\t" - "psrlh %[src0], %[src0], %[one] \n\t" - "pinsrh_3 %[dest0_u], %[src0], %[value] \n\t" - "dsll %[dest0_v], %[src0], %[sixteen] \n\t" - "pinsrh_0 %[dest0_v], %[dest0_v], %[value] \n\t" - "pmaddhw %[dest0_u], %[dest0_u], %[mask_u] \n\t" - "pmaddhw %[dest0_v], %[dest0_v], %[mask_v] \n\t" - - "gsldrc1 %[src0], 0x08(%[src_rgb]) \n\t" - "gsldlc1 %[src0], 0x0f(%[src_rgb]) \n\t" - "gsldrc1 %[src1], 0x08(%[src_rgb1]) \n\t" - "gsldlc1 %[src1], 0x0f(%[src_rgb1]) \n\t" - "punpcklbh %[src_lo], %[src0], %[zero] \n\t" - "punpckhbh %[src_hi], %[src0], %[zero] \n\t" - "paddh %[src0], %[src_lo], %[src_hi] \n\t" - "punpcklbh %[src_lo], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_lo] \n\t" - "punpckhbh %[src_hi], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_hi] \n\t" - "paddh %[src0], %[src0], %[ftmp12] \n\t" - "psrlh %[src0], %[src0], %[one] \n\t" - "pinsrh_3 %[src_lo], %[src0], %[value] \n\t" - "dsll %[src_hi], %[src0], %[sixteen] \n\t" - "pinsrh_0 %[src_hi], %[src_hi], %[value] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t" - - "punpcklwd %[src0], %[dest0_u], %[src_lo] \n\t" - "punpckhwd %[src1], %[dest0_u], %[src_lo] \n\t" - "psubw %[dest0_u], %[src1], %[src0] \n\t" - "psraw %[dest0_u], %[dest0_u], %[eight] \n\t" - "punpcklwd %[src0], %[dest0_v], %[src_hi] \n\t" - "punpckhwd %[src1], %[dest0_v], %[src_hi] \n\t" - "psubw %[dest0_v], %[src0], %[src1] \n\t" - "psraw %[dest0_v], %[dest0_v], %[eight] \n\t" - - "gsldrc1 %[src0], 0x10(%[src_rgb]) \n\t" - "gsldlc1 %[src0], 0x17(%[src_rgb]) \n\t" - "gsldrc1 %[src1], 0x10(%[src_rgb1]) \n\t" - "gsldlc1 %[src1], 0x17(%[src_rgb1]) \n\t" - "punpcklbh %[src_lo], %[src0], %[zero] \n\t" - "punpckhbh %[src_hi], %[src0], %[zero] \n\t" - "paddh %[src0], %[src_lo], %[src_hi] \n\t" - "punpcklbh %[src_lo], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_lo] \n\t" - "punpckhbh %[src_hi], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_hi] \n\t" - "paddh %[src0], %[src0], %[ftmp12] \n\t" - "psrlh %[src0], %[src0], %[one] \n\t" - "pinsrh_3 %[dest1_u], %[src0], %[value] \n\t" - "dsll %[dest1_v], %[src0], %[sixteen] \n\t" - "pinsrh_0 %[dest1_v], %[dest1_v], %[value] \n\t" - "pmaddhw %[dest1_u], %[dest1_u], 
%[mask_u] \n\t" - "pmaddhw %[dest1_v], %[dest1_v], %[mask_v] \n\t" - - "gsldrc1 %[src0], 0x18(%[src_rgb]) \n\t" - "gsldlc1 %[src0], 0x1f(%[src_rgb]) \n\t" - "gsldrc1 %[src1], 0x18(%[src_rgb1]) \n\t" - "gsldlc1 %[src1], 0x1f(%[src_rgb1]) \n\t" - "punpcklbh %[src_lo], %[src0], %[zero] \n\t" - "punpckhbh %[src_hi], %[src0], %[zero] \n\t" - "paddh %[src0], %[src_lo], %[src_hi] \n\t" - "punpcklbh %[src_lo], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_lo] \n\t" - "punpckhbh %[src_hi], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_hi] \n\t" - "paddh %[src0], %[src0], %[ftmp12] \n\t" - "psrlh %[src0], %[src0], %[one] \n\t" - "pinsrh_3 %[src_lo], %[src0], %[value] \n\t" - "dsll %[src_hi], %[src0], %[sixteen] \n\t" - "pinsrh_0 %[src_hi], %[src_hi], %[value] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t" - - "punpcklwd %[src0], %[dest1_u], %[src_lo] \n\t" - "punpckhwd %[src1], %[dest1_u], %[src_lo] \n\t" - "psubw %[dest1_u], %[src1], %[src0] \n\t" - "psraw %[dest1_u], %[dest1_u], %[eight] \n\t" - "punpcklwd %[src0], %[dest1_v], %[src_hi] \n\t" - "punpckhwd %[src1], %[dest1_v], %[src_hi] \n\t" - "psubw %[dest1_v], %[src0], %[src1] \n\t" - "psraw %[dest1_v], %[dest1_v], %[eight] \n\t" - - "gsldrc1 %[src0], 0x20(%[src_rgb]) \n\t" - "gsldlc1 %[src0], 0x27(%[src_rgb]) \n\t" - "gsldrc1 %[src1], 0x20(%[src_rgb1]) \n\t" - "gsldlc1 %[src1], 0x27(%[src_rgb1]) \n\t" - "punpcklbh %[src_lo], %[src0], %[zero] \n\t" - "punpckhbh %[src_hi], %[src0], %[zero] \n\t" - "paddh %[src0], %[src_lo], %[src_hi] \n\t" - "punpcklbh %[src_lo], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_lo] \n\t" - "punpckhbh %[src_hi], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_hi] \n\t" - "paddh %[src0], %[src0], %[ftmp12] \n\t" - "psrlh %[src0], %[src0], %[one] \n\t" - "pinsrh_3 %[dest2_u], %[src0], %[value] \n\t" - "dsll %[dest2_v], %[src0], %[sixteen] \n\t" - "pinsrh_0 %[dest2_v], %[dest2_v], %[value] \n\t" - "pmaddhw %[dest2_u], %[dest2_u], %[mask_u] \n\t" - "pmaddhw %[dest2_v], %[dest2_v], %[mask_v] \n\t" - - "gsldrc1 %[src0], 0x28(%[src_rgb]) \n\t" - "gsldlc1 %[src0], 0x2f(%[src_rgb]) \n\t" - "gsldrc1 %[src1], 0x28(%[src_rgb1]) \n\t" - "gsldlc1 %[src1], 0x2f(%[src_rgb1]) \n\t" - "punpcklbh %[src_lo], %[src0], %[zero] \n\t" - "punpckhbh %[src_hi], %[src0], %[zero] \n\t" - "paddh %[src0], %[src_lo], %[src_hi] \n\t" - "punpcklbh %[src_lo], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_lo] \n\t" - "punpckhbh %[src_hi], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_hi] \n\t" - "paddh %[src0], %[src0], %[ftmp12] \n\t" - "psrlh %[src0], %[src0], %[one] \n\t" - "pinsrh_3 %[src_lo], %[src0], %[value] \n\t" - "dsll %[src_hi], %[src0], %[sixteen] \n\t" - "pinsrh_0 %[src_hi], %[src_hi], %[value] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t" - - "punpcklwd %[src0], %[dest2_u], %[src_lo] \n\t" - "punpckhwd %[src1], %[dest2_u], %[src_lo] \n\t" - "psubw %[dest2_u], %[src1], %[src0] \n\t" - "psraw %[dest2_u], %[dest2_u], %[eight] \n\t" - "punpcklwd %[src0], %[dest2_v], %[src_hi] \n\t" - "punpckhwd %[src1], %[dest2_v], %[src_hi] \n\t" - "psubw %[dest2_v], %[src0], %[src1] \n\t" - "psraw %[dest2_v], %[dest2_v], %[eight] \n\t" - - "gsldrc1 %[src0], 0x30(%[src_rgb]) \n\t" - "gsldlc1 %[src0], 0x37(%[src_rgb]) \n\t" - "gsldrc1 %[src1], 0x30(%[src_rgb1]) \n\t" - "gsldlc1 %[src1], 0x37(%[src_rgb1]) \n\t" - "punpcklbh %[src_lo], %[src0], %[zero] \n\t" - "punpckhbh %[src_hi], %[src0], 
%[zero] \n\t" - "paddh %[src0], %[src_lo], %[src_hi] \n\t" - "punpcklbh %[src_lo], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_lo] \n\t" - "punpckhbh %[src_hi], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_hi] \n\t" - "paddh %[src0], %[src0], %[ftmp12] \n\t" - "psrlh %[src0], %[src0], %[one] \n\t" - "pinsrh_3 %[dest3_u], %[src0], %[value] \n\t" - "dsll %[dest3_v], %[src0], %[sixteen] \n\t" - "pinsrh_0 %[dest3_v], %[dest3_v], %[value] \n\t" - "pmaddhw %[dest3_u], %[dest3_u], %[mask_u] \n\t" - "pmaddhw %[dest3_v], %[dest3_v], %[mask_v] \n\t" - - "gsldrc1 %[src0], 0x38(%[src_rgb]) \n\t" - "gsldlc1 %[src0], 0x3f(%[src_rgb]) \n\t" - "gsldrc1 %[src1], 0x38(%[src_rgb1]) \n\t" - "gsldlc1 %[src1], 0x3f(%[src_rgb1]) \n\t" - "punpcklbh %[src_lo], %[src0], %[zero] \n\t" - "punpckhbh %[src_hi], %[src0], %[zero] \n\t" - "paddh %[src0], %[src_lo], %[src_hi] \n\t" - "punpcklbh %[src_lo], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_lo] \n\t" - "punpckhbh %[src_hi], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_hi] \n\t" - "paddh %[src0], %[src0], %[ftmp12] \n\t" - "psrlh %[src0], %[src0], %[one] \n\t" - "pinsrh_3 %[src_lo], %[src0], %[value] \n\t" - "dsll %[src_hi], %[src0], %[sixteen] \n\t" - "pinsrh_0 %[src_hi], %[src_hi], %[value] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t" - - "punpcklwd %[src0], %[dest3_u], %[src_lo] \n\t" - "punpckhwd %[src1], %[dest3_u], %[src_lo] \n\t" - "psubw %[dest3_u], %[src1], %[src0] \n\t" - "psraw %[dest3_u], %[dest3_u], %[eight] \n\t" - "punpcklwd %[src0], %[dest3_v], %[src_hi] \n\t" - "punpckhwd %[src1], %[dest3_v], %[src_hi] \n\t" - "psubw %[dest3_v], %[src0], %[src1] \n\t" - "psraw %[dest3_v], %[dest3_v], %[eight] \n\t" - - "packsswh %[src0], %[dest0_u], %[dest1_u] \n\t" - "packsswh %[src1], %[dest2_u], %[dest3_u] \n\t" - "packushb %[dest0_u], %[src0], %[src1] \n\t" - "gssdlc1 %[dest0_u], 0x07(%[dst_u]) \n\t" - "gssdrc1 %[dest0_u], 0x00(%[dst_u]) \n\t" - - "packsswh %[src0], %[dest0_v], %[dest1_v] \n\t" - "packsswh %[src1], %[dest2_v], %[dest3_v] \n\t" - "packushb %[dest0_v], %[src0], %[src1] \n\t" - "gssdlc1 %[dest0_v], 0x07(%[dst_v]) \n\t" - "gssdrc1 %[dest0_v], 0x00(%[dst_v]) \n\t" - - "daddiu %[src_rgb], %[src_rgb], 0x40 \n\t" - "daddiu %[dst_u], %[dst_u], 0x08 \n\t" - "daddiu %[dst_v], %[dst_v], 0x08 \n\t" - "daddi %[width], %[width], -0x10 \n\t" - "bgtz %[width], 1b \n\t" - : [src_rgb1] "=&r"(src_rgb1), [src0] "=&f"(ftmp[0]), - [src1] "=&f"(ftmp[1]), [src_lo] "=&f"(ftmp[2]), [src_hi] "=&f"(ftmp[3]), - [dest0_u] "=&f"(ftmp[4]), [dest0_v] "=&f"(ftmp[5]), - [dest1_u] "=&f"(ftmp[6]), [dest1_v] "=&f"(ftmp[7]), - [dest2_u] "=&f"(ftmp[8]), [dest2_v] "=&f"(ftmp[9]), - [dest3_u] "=&f"(ftmp[10]), [dest3_v] "=&f"(ftmp[11]), - [ftmp12] "=&f"(ftmp[12]), [tmp0] "=&r"(tmp[0]) - : [src_rgb] "r"(src_rgb), [src_stride_rgb] "r"(src_stride_rgb), - [dst_u] "r"(dst_u), [dst_v] "r"(dst_v), [width] "r"(width), - [mask_u] "f"(mask_u), [mask_v] "f"(mask_v), [value] "f"(value), - [zero] "f"(0x00), [eight] "f"(0x08), [one] "f"(0x01), - [sixteen] "f"(0x10) - : "memory"); -} - -void RGBAToYRow_MMI(const uint8_t* src_argb, uint8_t* dst_y, int width) { - uint64_t src, src_hi, src_lo; - uint64_t dest0, dest1, dest2, dest3; - const uint64_t value = 0x1080; - const uint64_t mask = 0x0042008100190001; - - __asm__ volatile( - "1: \n\t" - "gsldlc1 %[src], 0x07(%[src_argb]) \n\t" - "gsldrc1 %[src], 0x00(%[src_argb]) \n\t" - "punpcklbh %[src_lo], %[src], %[zero] \n\t" - "pinsrh_0 %[src_lo], %[src_lo], 
%[value] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" - "punpckhbh %[src_hi], %[src], %[zero] \n\t" - "pinsrh_0 %[src_hi], %[src_hi], %[value] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t" - "punpcklwd %[src], %[src_lo], %[src_hi] \n\t" - "punpckhwd %[dest0], %[src_lo], %[src_hi] \n\t" - "paddw %[dest0], %[dest0], %[src] \n\t" - "psrlw %[dest0], %[dest0], %[eight] \n\t" - - "gsldlc1 %[src], 0x0f(%[src_argb]) \n\t" - "gsldrc1 %[src], 0x08(%[src_argb]) \n\t" - "punpcklbh %[src_lo], %[src], %[zero] \n\t" - "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" - "punpckhbh %[src_hi], %[src], %[zero] \n\t" - "pinsrh_0 %[src_hi], %[src_hi], %[value] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t" - "punpcklwd %[src], %[src_lo], %[src_hi] \n\t" - "punpckhwd %[dest1], %[src_lo], %[src_hi] \n\t" - "paddw %[dest1], %[dest1], %[src] \n\t" - "psrlw %[dest1], %[dest1], %[eight] \n\t" - - "gsldlc1 %[src], 0x17(%[src_argb]) \n\t" - "gsldrc1 %[src], 0x10(%[src_argb]) \n\t" - "punpcklbh %[src_lo], %[src], %[zero] \n\t" - "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" - "punpckhbh %[src_hi], %[src], %[zero] \n\t" - "pinsrh_0 %[src_hi], %[src_hi], %[value] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t" - "punpcklwd %[src], %[src_lo], %[src_hi] \n\t" - "punpckhwd %[dest2], %[src_lo], %[src_hi] \n\t" - "paddw %[dest2], %[dest2], %[src] \n\t" - "psrlw %[dest2], %[dest2], %[eight] \n\t" - - "gsldlc1 %[src], 0x1f(%[src_argb]) \n\t" - "gsldrc1 %[src], 0x18(%[src_argb]) \n\t" - "punpcklbh %[src_lo], %[src], %[zero] \n\t" - "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" - "punpckhbh %[src_hi], %[src], %[zero] \n\t" - "pinsrh_0 %[src_hi], %[src_hi], %[value] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t" - "punpcklwd %[src], %[src_lo], %[src_hi] \n\t" - "punpckhwd %[dest3], %[src_lo], %[src_hi] \n\t" - "paddw %[dest3], %[dest3], %[src] \n\t" - "psrlw %[dest3], %[dest3], %[eight] \n\t" - - "packsswh %[src_lo], %[dest0], %[dest1] \n\t" - "packsswh %[src_hi], %[dest2], %[dest3] \n\t" - "packushb %[dest0], %[src_lo], %[src_hi] \n\t" - "gssdlc1 %[dest0], 0x07(%[dst_y]) \n\t" - "gssdrc1 %[dest0], 0x00(%[dst_y]) \n\t" - - "daddiu %[src_argb], %[src_argb], 0x20 \n\t" - "daddiu %[dst_y], %[dst_y], 0x08 \n\t" - "daddi %[width], %[width], -0x08 \n\t" - "bnez %[width], 1b \n\t" - : [src] "=&f"(src), [src_hi] "=&f"(src_hi), [src_lo] "=&f"(src_lo), - [dest0] "=&f"(dest0), [dest1] "=&f"(dest1), [dest2] "=&f"(dest2), - [dest3] "=&f"(dest3) - : [src_argb] "r"(src_argb), [dst_y] "r"(dst_y), [width] "r"(width), - [mask] "f"(mask), [value] "f"(value), [eight] "f"(0x08), - [zero] "f"(0x00) - : "memory"); -} - -void RGBAToUVRow_MMI(const uint8_t* src_rgb, - int src_stride_rgb, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - uint64_t src_rgb1; - uint64_t ftmp[13]; - uint64_t tmp[1]; - const uint64_t value = 0x4040; - const uint64_t mask_u = 0x0013002500380002; - const uint64_t mask_v = 0x00020038002f0009; - - __asm__ volatile( - "dli %[tmp0], 0x0001000100010001 \n\t" - "dmtc1 %[tmp0], %[ftmp12] \n\t" - "1: \n\t" - "daddu %[src_rgb1], %[src_rgb], %[src_stride_rgb] \n\t" - "gsldrc1 %[src0], 0x00(%[src_rgb]) \n\t" - "gsldlc1 %[src0], 0x07(%[src_rgb]) \n\t" - "gsldrc1 %[src1], 0x00(%[src_rgb1]) \n\t" - "gsldlc1 %[src1], 0x07(%[src_rgb1]) \n\t" - "punpcklbh %[src_lo], %[src0], %[zero] \n\t" - "punpckhbh %[src_hi], %[src0], %[zero] \n\t" - "paddh %[src0], %[src_lo], %[src_hi] 
\n\t" - "punpcklbh %[src_lo], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_lo] \n\t" - "punpckhbh %[src_hi], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_hi] \n\t" - "paddh %[src0], %[src0], %[ftmp12] \n\t" - "psrlh %[src0], %[src0], %[one] \n\t" - "pinsrh_0 %[dest0_u], %[src0], %[value] \n\t" - "dsrl %[dest0_v], %[src0], %[sixteen] \n\t" - "pinsrh_3 %[dest0_v], %[dest0_v], %[value] \n\t" - "pmaddhw %[dest0_u], %[dest0_u], %[mask_u] \n\t" - "pmaddhw %[dest0_v], %[dest0_v], %[mask_v] \n\t" - - "gsldrc1 %[src0], 0x08(%[src_rgb]) \n\t" - "gsldlc1 %[src0], 0x0f(%[src_rgb]) \n\t" - "gsldrc1 %[src1], 0x08(%[src_rgb1]) \n\t" - "gsldlc1 %[src1], 0x0f(%[src_rgb1]) \n\t" - "punpcklbh %[src_lo], %[src0], %[zero] \n\t" - "punpckhbh %[src_hi], %[src0], %[zero] \n\t" - "paddh %[src0], %[src_lo], %[src_hi] \n\t" - "punpcklbh %[src_lo], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_lo] \n\t" - "punpckhbh %[src_hi], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_hi] \n\t" - "paddh %[src0], %[src0], %[ftmp12] \n\t" - "psrlh %[src0], %[src0], %[one] \n\t" - "pinsrh_0 %[src_lo], %[src0], %[value] \n\t" - "dsrl %[src_hi], %[src0], %[sixteen] \n\t" - "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t" - - "punpcklwd %[src0], %[dest0_u], %[src_lo] \n\t" - "punpckhwd %[src1], %[dest0_u], %[src_lo] \n\t" - "psubw %[dest0_u], %[src0], %[src1] \n\t" - "psraw %[dest0_u], %[dest0_u], %[eight] \n\t" - "punpcklwd %[src0], %[dest0_v], %[src_hi] \n\t" - "punpckhwd %[src1], %[dest0_v], %[src_hi] \n\t" - "psubw %[dest0_v], %[src1], %[src0] \n\t" - "psraw %[dest0_v], %[dest0_v], %[eight] \n\t" - - "gsldrc1 %[src0], 0x10(%[src_rgb]) \n\t" - "gsldlc1 %[src0], 0x17(%[src_rgb]) \n\t" - "gsldrc1 %[src1], 0x10(%[src_rgb1]) \n\t" - "gsldlc1 %[src1], 0x17(%[src_rgb1]) \n\t" - "punpcklbh %[src_lo], %[src0], %[zero] \n\t" - "punpckhbh %[src_hi], %[src0], %[zero] \n\t" - "paddh %[src0], %[src_lo], %[src_hi] \n\t" - "punpcklbh %[src_lo], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_lo] \n\t" - "punpckhbh %[src_hi], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_hi] \n\t" - "paddh %[src0], %[src0], %[ftmp12] \n\t" - "psrlh %[src0], %[src0], %[one] \n\t" - "pinsrh_0 %[dest1_u], %[src0], %[value] \n\t" - "dsrl %[dest1_v], %[src0], %[sixteen] \n\t" - "pinsrh_3 %[dest1_v], %[dest1_v], %[value] \n\t" - "pmaddhw %[dest1_u], %[dest1_u], %[mask_u] \n\t" - "pmaddhw %[dest1_v], %[dest1_v], %[mask_v] \n\t" - - "gsldrc1 %[src0], 0x18(%[src_rgb]) \n\t" - "gsldlc1 %[src0], 0x1f(%[src_rgb]) \n\t" - "gsldrc1 %[src1], 0x18(%[src_rgb1]) \n\t" - "gsldlc1 %[src1], 0x1f(%[src_rgb1]) \n\t" - "punpcklbh %[src_lo], %[src0], %[zero] \n\t" - "punpckhbh %[src_hi], %[src0], %[zero] \n\t" - "paddh %[src0], %[src_lo], %[src_hi] \n\t" - "punpcklbh %[src_lo], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_lo] \n\t" - "punpckhbh %[src_hi], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_hi] \n\t" - "paddh %[src0], %[src0], %[ftmp12] \n\t" - "psrlh %[src0], %[src0], %[one] \n\t" - "pinsrh_0 %[src_lo], %[src0], %[value] \n\t" - "dsrl %[src_hi], %[src0], %[sixteen] \n\t" - "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t" - - "punpcklwd %[src0], %[dest1_u], %[src_lo] \n\t" - "punpckhwd %[src1], %[dest1_u], %[src_lo] \n\t" - "psubw %[dest1_u], %[src0], %[src1] \n\t" - "psraw %[dest1_u], %[dest1_u], %[eight] \n\t" 
- "punpcklwd %[src0], %[dest1_v], %[src_hi] \n\t" - "punpckhwd %[src1], %[dest1_v], %[src_hi] \n\t" - "psubw %[dest1_v], %[src1], %[src0] \n\t" - "psraw %[dest1_v], %[dest1_v], %[eight] \n\t" - - "gsldrc1 %[src0], 0x20(%[src_rgb]) \n\t" - "gsldlc1 %[src0], 0x27(%[src_rgb]) \n\t" - "gsldrc1 %[src1], 0x20(%[src_rgb1]) \n\t" - "gsldlc1 %[src1], 0x27(%[src_rgb1]) \n\t" - "punpcklbh %[src_lo], %[src0], %[zero] \n\t" - "punpckhbh %[src_hi], %[src0], %[zero] \n\t" - "paddh %[src0], %[src_lo], %[src_hi] \n\t" - "punpcklbh %[src_lo], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_lo] \n\t" - "punpckhbh %[src_hi], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_hi] \n\t" - "paddh %[src0], %[src0], %[ftmp12] \n\t" - "psrlh %[src0], %[src0], %[one] \n\t" - "pinsrh_0 %[dest2_u], %[src0], %[value] \n\t" - "dsrl %[dest2_v], %[src0], %[sixteen] \n\t" - "pinsrh_3 %[dest2_v], %[dest2_v], %[value] \n\t" - "pmaddhw %[dest2_u], %[dest2_u], %[mask_u] \n\t" - "pmaddhw %[dest2_v], %[dest2_v], %[mask_v] \n\t" - - "gsldrc1 %[src0], 0x28(%[src_rgb]) \n\t" - "gsldlc1 %[src0], 0x2f(%[src_rgb]) \n\t" - "gsldrc1 %[src1], 0x28(%[src_rgb1]) \n\t" - "gsldlc1 %[src1], 0x2f(%[src_rgb1]) \n\t" - "punpcklbh %[src_lo], %[src0], %[zero] \n\t" - "punpckhbh %[src_hi], %[src0], %[zero] \n\t" - "paddh %[src0], %[src_lo], %[src_hi] \n\t" - "punpcklbh %[src_lo], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_lo] \n\t" - "punpckhbh %[src_hi], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_hi] \n\t" - "paddh %[src0], %[src0], %[ftmp12] \n\t" - "psrlh %[src0], %[src0], %[one] \n\t" - "pinsrh_0 %[src_lo], %[src0], %[value] \n\t" - "dsrl %[src_hi], %[src0], %[sixteen] \n\t" - "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t" - - "punpcklwd %[src0], %[dest2_u], %[src_lo] \n\t" - "punpckhwd %[src1], %[dest2_u], %[src_lo] \n\t" - "psubw %[dest2_u], %[src0], %[src1] \n\t" - "psraw %[dest2_u], %[dest2_u], %[eight] \n\t" - "punpcklwd %[src0], %[dest2_v], %[src_hi] \n\t" - "punpckhwd %[src1], %[dest2_v], %[src_hi] \n\t" - "psubw %[dest2_v], %[src1], %[src0] \n\t" - "psraw %[dest2_v], %[dest2_v], %[eight] \n\t" - - "gsldrc1 %[src0], 0x30(%[src_rgb]) \n\t" - "gsldlc1 %[src0], 0x37(%[src_rgb]) \n\t" - "gsldrc1 %[src1], 0x30(%[src_rgb1]) \n\t" - "gsldlc1 %[src1], 0x37(%[src_rgb1]) \n\t" - "punpcklbh %[src_lo], %[src0], %[zero] \n\t" - "punpckhbh %[src_hi], %[src0], %[zero] \n\t" - "paddh %[src0], %[src_lo], %[src_hi] \n\t" - "punpcklbh %[src_lo], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_lo] \n\t" - "punpckhbh %[src_hi], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_hi] \n\t" - "paddh %[src0], %[src0], %[ftmp12] \n\t" - "psrlh %[src0], %[src0], %[one] \n\t" - "pinsrh_0 %[dest3_u], %[src0], %[value] \n\t" - "dsrl %[dest3_v], %[src0], %[sixteen] \n\t" - "pinsrh_3 %[dest3_v], %[dest3_v], %[value] \n\t" - "pmaddhw %[dest3_u], %[dest3_u], %[mask_u] \n\t" - "pmaddhw %[dest3_v], %[dest3_v], %[mask_v] \n\t" - - "gsldrc1 %[src0], 0x38(%[src_rgb]) \n\t" - "gsldlc1 %[src0], 0x3f(%[src_rgb]) \n\t" - "gsldrc1 %[src1], 0x38(%[src_rgb1]) \n\t" - "gsldlc1 %[src1], 0x3f(%[src_rgb1]) \n\t" - "punpcklbh %[src_lo], %[src0], %[zero] \n\t" - "punpckhbh %[src_hi], %[src0], %[zero] \n\t" - "paddh %[src0], %[src_lo], %[src_hi] \n\t" - "punpcklbh %[src_lo], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_lo] \n\t" - "punpckhbh %[src_hi], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_hi] \n\t" - "paddh 
%[src0], %[src0], %[ftmp12] \n\t" - "psrlh %[src0], %[src0], %[one] \n\t" - "pinsrh_0 %[src_lo], %[src0], %[value] \n\t" - "dsrl %[src_hi], %[src0], %[sixteen] \n\t" - "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t" - - "punpcklwd %[src0], %[dest3_u], %[src_lo] \n\t" - "punpckhwd %[src1], %[dest3_u], %[src_lo] \n\t" - "psubw %[dest3_u], %[src0], %[src1] \n\t" - "psraw %[dest3_u], %[dest3_u], %[eight] \n\t" - "punpcklwd %[src0], %[dest3_v], %[src_hi] \n\t" - "punpckhwd %[src1], %[dest3_v], %[src_hi] \n\t" - "psubw %[dest3_v], %[src1], %[src0] \n\t" - "psraw %[dest3_v], %[dest3_v], %[eight] \n\t" - - "packsswh %[src0], %[dest0_u], %[dest1_u] \n\t" - "packsswh %[src1], %[dest2_u], %[dest3_u] \n\t" - "packushb %[dest0_u], %[src0], %[src1] \n\t" - "gssdlc1 %[dest0_u], 0x07(%[dst_u]) \n\t" - "gssdrc1 %[dest0_u], 0x00(%[dst_u]) \n\t" - - "packsswh %[src0], %[dest0_v], %[dest1_v] \n\t" - "packsswh %[src1], %[dest2_v], %[dest3_v] \n\t" - "packushb %[dest0_v], %[src0], %[src1] \n\t" - "gssdlc1 %[dest0_v], 0x07(%[dst_v]) \n\t" - "gssdrc1 %[dest0_v], 0x00(%[dst_v]) \n\t" - - "daddiu %[src_rgb], %[src_rgb], 0x40 \n\t" - "daddiu %[dst_u], %[dst_u], 0x08 \n\t" - "daddiu %[dst_v], %[dst_v], 0x08 \n\t" - "daddi %[width], %[width], -0x10 \n\t" - "bgtz %[width], 1b \n\t" - : [src_rgb1] "=&r"(src_rgb1), [src0] "=&f"(ftmp[0]), - [src1] "=&f"(ftmp[1]), [src_lo] "=&f"(ftmp[2]), [src_hi] "=&f"(ftmp[3]), - [dest0_u] "=&f"(ftmp[4]), [dest0_v] "=&f"(ftmp[5]), - [dest1_u] "=&f"(ftmp[6]), [dest1_v] "=&f"(ftmp[7]), - [dest2_u] "=&f"(ftmp[8]), [dest2_v] "=&f"(ftmp[9]), - [dest3_u] "=&f"(ftmp[10]), [dest3_v] "=&f"(ftmp[11]), - [ftmp12] "=&f"(ftmp[12]), [tmp0] "=&r"(tmp[0]) - : [src_rgb] "r"(src_rgb), [src_stride_rgb] "r"(src_stride_rgb), - [dst_u] "r"(dst_u), [dst_v] "r"(dst_v), [width] "r"(width), - [mask_u] "f"(mask_u), [mask_v] "f"(mask_v), [value] "f"(value), - [zero] "f"(0x00), [eight] "f"(0x08), [one] "f"(0x01), - [sixteen] "f"(0x10) - : "memory"); -} - -void RGB24ToYRow_MMI(const uint8_t* src_argb, uint8_t* dst_y, int width) { - uint64_t src, src_hi, src_lo; - uint64_t dest0, dest1, dest2, dest3; - const uint64_t value = 0x1080; - const uint64_t mask = 0x0001004200810019; - - __asm__ volatile( - "1: \n\t" - "gsldlc1 %[src], 0x07(%[src_argb]) \n\t" - "gsldrc1 %[src], 0x00(%[src_argb]) \n\t" - "punpcklbh %[src_lo], %[src], %[zero] \n\t" - "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" - "dsll %[src], %[src], %[eight] \n\t" - "punpckhbh %[src_hi], %[src], %[zero] \n\t" - "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t" - "punpcklwd %[src], %[src_lo], %[src_hi] \n\t" - "punpckhwd %[dest0], %[src_lo], %[src_hi] \n\t" - "paddw %[dest0], %[dest0], %[src] \n\t" - "psrlw %[dest0], %[dest0], %[eight] \n\t" - - "gsldlc1 %[src], 0x0d(%[src_argb]) \n\t" - "gsldrc1 %[src], 0x06(%[src_argb]) \n\t" - "punpcklbh %[src_lo], %[src], %[zero] \n\t" - "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" - "dsll %[src], %[src], %[eight] \n\t" - "punpckhbh %[src_hi], %[src], %[zero] \n\t" - "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t" - "punpcklwd %[src], %[src_lo], %[src_hi] \n\t" - "punpckhwd %[dest1], %[src_lo], %[src_hi] \n\t" - "paddw %[dest1], %[dest1], %[src] \n\t" - "psrlw %[dest1], %[dest1], %[eight] \n\t" - - "gsldlc1 %[src], 0x13(%[src_argb]) \n\t" - 
"gsldrc1 %[src], 0x0c(%[src_argb]) \n\t" - "punpcklbh %[src_lo], %[src], %[zero] \n\t" - "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" - "dsll %[src], %[src], %[eight] \n\t" - "punpckhbh %[src_hi], %[src], %[zero] \n\t" - "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t" - "punpcklwd %[src], %[src_lo], %[src_hi] \n\t" - "punpckhwd %[dest2], %[src_lo], %[src_hi] \n\t" - "paddw %[dest2], %[dest2], %[src] \n\t" - "psrlw %[dest2], %[dest2], %[eight] \n\t" - - "gsldlc1 %[src], 0x19(%[src_argb]) \n\t" - "gsldrc1 %[src], 0x12(%[src_argb]) \n\t" - "punpcklbh %[src_lo], %[src], %[zero] \n\t" - "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" - "dsll %[src], %[src], %[eight] \n\t" - "punpckhbh %[src_hi], %[src], %[zero] \n\t" - "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t" - "punpcklwd %[src], %[src_lo], %[src_hi] \n\t" - "punpckhwd %[dest3], %[src_lo], %[src_hi] \n\t" - "paddw %[dest3], %[dest3], %[src] \n\t" - "psrlw %[dest3], %[dest3], %[eight] \n\t" - - "packsswh %[src_lo], %[dest0], %[dest1] \n\t" - "packsswh %[src_hi], %[dest2], %[dest3] \n\t" - "packushb %[dest0], %[src_lo], %[src_hi] \n\t" - "gssdlc1 %[dest0], 0x07(%[dst_y]) \n\t" - "gssdrc1 %[dest0], 0x00(%[dst_y]) \n\t" - - "daddiu %[src_argb], %[src_argb], 0x18 \n\t" - "daddiu %[dst_y], %[dst_y], 0x08 \n\t" - "daddi %[width], %[width], -0x08 \n\t" - "bnez %[width], 1b \n\t" - : [src] "=&f"(src), [src_hi] "=&f"(src_hi), [src_lo] "=&f"(src_lo), - [dest0] "=&f"(dest0), [dest1] "=&f"(dest1), [dest2] "=&f"(dest2), - [dest3] "=&f"(dest3) - : [src_argb] "r"(src_argb), [dst_y] "r"(dst_y), [width] "r"(width), - [mask] "f"(mask), [value] "f"(value), [eight] "f"(0x08), - [zero] "f"(0x00) - : "memory"); -} - -void RGB24ToUVRow_MMI(const uint8_t* src_rgb, - int src_stride_rgb, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - uint64_t src_rgb1; - uint64_t ftmp[13]; - uint64_t tmp[1]; - const uint64_t value = 0x4040; - const uint64_t mask_u = 0x0013002500380002; - const uint64_t mask_v = 0x00020038002f0009; - - __asm__ volatile( - "dli %[tmp0], 0x0001000100010001 \n\t" - "dmtc1 %[tmp0], %[ftmp12] \n\t" - "1: \n\t" - "daddu %[src_rgb1], %[src_rgb], %[src_stride_rgb] \n\t" - "gsldrc1 %[src0], 0x00(%[src_rgb]) \n\t" - "gsldlc1 %[src0], 0x07(%[src_rgb]) \n\t" - "gsldrc1 %[src1], 0x00(%[src_rgb1]) \n\t" - "gsldlc1 %[src1], 0x07(%[src_rgb1]) \n\t" - "punpcklbh %[src_lo], %[src0], %[zero] \n\t" - "dsll %[src0], %[src0], %[eight] \n\t" - "punpckhbh %[src_hi], %[src0], %[zero] \n\t" - "paddh %[src0], %[src_lo], %[src_hi] \n\t" - "punpcklbh %[src_lo], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_lo] \n\t" - "dsll %[src1], %[src1], %[eight] \n\t" - "punpckhbh %[src_hi], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_hi] \n\t" - "paddh %[src0], %[src0], %[ftmp12] \n\t" - "psrlh %[src0], %[src0], %[one] \n\t" - "dsll %[dest0_u], %[src0], %[sixteen] \n\t" - "pinsrh_0 %[dest0_u], %[dest0_u], %[value] \n\t" - "pinsrh_3 %[dest0_v], %[src0], %[value] \n\t" - "pmaddhw %[dest0_u], %[dest0_u], %[mask_u] \n\t" - "pmaddhw %[dest0_v], %[dest0_v], %[mask_v] \n\t" - - "gsldrc1 %[src0], 0x06(%[src_rgb]) \n\t" - "gsldlc1 %[src0], 0x0d(%[src_rgb]) \n\t" - "gsldrc1 %[src1], 0x06(%[src_rgb1]) \n\t" - "gsldlc1 %[src1], 0x0d(%[src_rgb1]) \n\t" - "punpcklbh %[src_lo], %[src0], %[zero] \n\t" - "dsll %[src0], %[src0], %[eight] \n\t" - "punpckhbh %[src_hi], %[src0], %[zero] \n\t" - 
"paddh %[src0], %[src_lo], %[src_hi] \n\t" - "punpcklbh %[src_lo], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_lo] \n\t" - "dsll %[src1], %[src1], %[eight] \n\t" - "punpckhbh %[src_hi], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_hi] \n\t" - "paddh %[src0], %[src0], %[ftmp12] \n\t" - "psrlh %[src0], %[src0], %[one] \n\t" - "dsll %[src_lo], %[src0], %[sixteen] \n\t" - "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t" - "pinsrh_3 %[src_hi], %[src0], %[value] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t" - - "punpcklwd %[src0], %[dest0_u], %[src_lo] \n\t" - "punpckhwd %[src1], %[dest0_u], %[src_lo] \n\t" - "psubw %[dest0_u], %[src0], %[src1] \n\t" - "psraw %[dest0_u], %[dest0_u], %[eight] \n\t" - "punpcklwd %[src0], %[dest0_v], %[src_hi] \n\t" - "punpckhwd %[src1], %[dest0_v], %[src_hi] \n\t" - "psubw %[dest0_v], %[src1], %[src0] \n\t" - "psraw %[dest0_v], %[dest0_v], %[eight] \n\t" - - "gsldrc1 %[src0], 0x0c(%[src_rgb]) \n\t" - "gsldlc1 %[src0], 0x13(%[src_rgb]) \n\t" - "gsldrc1 %[src1], 0x0c(%[src_rgb1]) \n\t" - "gsldlc1 %[src1], 0x13(%[src_rgb1]) \n\t" - "punpcklbh %[src_lo], %[src0], %[zero] \n\t" - "dsll %[src0], %[src0], %[eight] \n\t" - "punpckhbh %[src_hi], %[src0], %[zero] \n\t" - "paddh %[src0], %[src_lo], %[src_hi] \n\t" - "punpcklbh %[src_lo], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_lo] \n\t" - "dsll %[src1], %[src1], %[eight] \n\t" - "punpckhbh %[src_hi], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_hi] \n\t" - "paddh %[src0], %[src0], %[ftmp12] \n\t" - "psrlh %[src0], %[src0], %[one] \n\t" - "dsll %[dest1_u], %[src0], %[sixteen] \n\t" - "pinsrh_0 %[dest1_u], %[dest1_u], %[value] \n\t" - "pinsrh_3 %[dest1_v], %[src0], %[value] \n\t" - "pmaddhw %[dest1_u], %[dest1_u], %[mask_u] \n\t" - "pmaddhw %[dest1_v], %[dest1_v], %[mask_v] \n\t" - - "gsldrc1 %[src0], 0x12(%[src_rgb]) \n\t" - "gsldlc1 %[src0], 0x19(%[src_rgb]) \n\t" - "gsldrc1 %[src1], 0x12(%[src_rgb1]) \n\t" - "gsldlc1 %[src1], 0x19(%[src_rgb1]) \n\t" - "punpcklbh %[src_lo], %[src0], %[zero] \n\t" - "dsll %[src0], %[src0], %[eight] \n\t" - "punpckhbh %[src_hi], %[src0], %[zero] \n\t" - "paddh %[src0], %[src_lo], %[src_hi] \n\t" - "punpcklbh %[src_lo], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_lo] \n\t" - "dsll %[src1], %[src1], %[eight] \n\t" - "punpckhbh %[src_hi], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_hi] \n\t" - "paddh %[src0], %[src0], %[ftmp12] \n\t" - "psrlh %[src0], %[src0], %[one] \n\t" - "dsll %[src_lo], %[src0], %[sixteen] \n\t" - "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t" - "pinsrh_3 %[src_hi], %[src0], %[value] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t" - - "punpcklwd %[src0], %[dest1_u], %[src_lo] \n\t" - "punpckhwd %[src1], %[dest1_u], %[src_lo] \n\t" - "psubw %[dest1_u], %[src0], %[src1] \n\t" - "psraw %[dest1_u], %[dest1_u], %[eight] \n\t" - "punpcklwd %[src0], %[dest1_v], %[src_hi] \n\t" - "punpckhwd %[src1], %[dest1_v], %[src_hi] \n\t" - "psubw %[dest1_v], %[src1], %[src0] \n\t" - "psraw %[dest1_v], %[dest1_v], %[eight] \n\t" - - "gsldrc1 %[src0], 0x18(%[src_rgb]) \n\t" - "gsldlc1 %[src0], 0x1f(%[src_rgb]) \n\t" - "gsldrc1 %[src1], 0x18(%[src_rgb1]) \n\t" - "gsldlc1 %[src1], 0x1f(%[src_rgb1]) \n\t" - "punpcklbh %[src_lo], %[src0], %[zero] \n\t" - "dsll %[src0], %[src0], %[eight] \n\t" - "punpckhbh %[src_hi], %[src0], %[zero] \n\t" - "paddh %[src0], %[src_lo], %[src_hi] \n\t" - "punpcklbh %[src_lo], %[src1], 
%[zero] \n\t" - "paddh %[src0], %[src0], %[src_lo] \n\t" - "dsll %[src1], %[src1], %[eight] \n\t" - "punpckhbh %[src_hi], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_hi] \n\t" - "paddh %[src0], %[src0], %[ftmp12] \n\t" - "psrlh %[src0], %[src0], %[one] \n\t" - "dsll %[dest2_u], %[src0], %[sixteen] \n\t" - "pinsrh_0 %[dest2_u], %[dest2_u], %[value] \n\t" - "pinsrh_3 %[dest2_v], %[src0], %[value] \n\t" - "pmaddhw %[dest2_u], %[dest2_u], %[mask_u] \n\t" - "pmaddhw %[dest2_v], %[dest2_v], %[mask_v] \n\t" - - "gsldrc1 %[src0], 0x1e(%[src_rgb]) \n\t" - "gsldlc1 %[src0], 0x25(%[src_rgb]) \n\t" - "gsldrc1 %[src1], 0x1e(%[src_rgb1]) \n\t" - "gsldlc1 %[src1], 0x25(%[src_rgb1]) \n\t" - "punpcklbh %[src_lo], %[src0], %[zero] \n\t" - "dsll %[src0], %[src0], %[eight] \n\t" - "punpckhbh %[src_hi], %[src0], %[zero] \n\t" - "paddh %[src0], %[src_lo], %[src_hi] \n\t" - "punpcklbh %[src_lo], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_lo] \n\t" - "dsll %[src1], %[src1], %[eight] \n\t" - "punpckhbh %[src_hi], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_hi] \n\t" - "paddh %[src0], %[src0], %[ftmp12] \n\t" - "psrlh %[src0], %[src0], %[one] \n\t" - "dsll %[src_lo], %[src0], %[sixteen] \n\t" - "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t" - "pinsrh_3 %[src_hi], %[src0], %[value] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t" - - "punpcklwd %[src0], %[dest2_u], %[src_lo] \n\t" - "punpckhwd %[src1], %[dest2_u], %[src_lo] \n\t" - "psubw %[dest2_u], %[src0], %[src1] \n\t" - "psraw %[dest2_u], %[dest2_u], %[eight] \n\t" - "punpcklwd %[src0], %[dest2_v], %[src_hi] \n\t" - "punpckhwd %[src1], %[dest2_v], %[src_hi] \n\t" - "psubw %[dest2_v], %[src1], %[src0] \n\t" - "psraw %[dest2_v], %[dest2_v], %[eight] \n\t" - - "gsldrc1 %[src0], 0x24(%[src_rgb]) \n\t" - "gsldlc1 %[src0], 0x2b(%[src_rgb]) \n\t" - "gsldrc1 %[src1], 0x24(%[src_rgb1]) \n\t" - "gsldlc1 %[src1], 0x2b(%[src_rgb1]) \n\t" - "punpcklbh %[src_lo], %[src0], %[zero] \n\t" - "dsll %[src0], %[src0], %[eight] \n\t" - "punpckhbh %[src_hi], %[src0], %[zero] \n\t" - "paddh %[src0], %[src_lo], %[src_hi] \n\t" - "punpcklbh %[src_lo], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_lo] \n\t" - "dsll %[src1], %[src1], %[eight] \n\t" - "punpckhbh %[src_hi], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_hi] \n\t" - "paddh %[src0], %[src0], %[ftmp12] \n\t" - "psrlh %[src0], %[src0], %[one] \n\t" - "dsll %[dest3_u], %[src0], %[sixteen] \n\t" - "pinsrh_0 %[dest3_u], %[dest3_u], %[value] \n\t" - "pinsrh_3 %[dest3_v], %[src0], %[value] \n\t" - "pmaddhw %[dest3_u], %[dest3_u], %[mask_u] \n\t" - "pmaddhw %[dest3_v], %[dest3_v], %[mask_v] \n\t" - - "gsldrc1 %[src0], 0x2a(%[src_rgb]) \n\t" - "gsldlc1 %[src0], 0x31(%[src_rgb]) \n\t" - "gsldrc1 %[src1], 0x2a(%[src_rgb1]) \n\t" - "gsldlc1 %[src1], 0x31(%[src_rgb1]) \n\t" - "punpcklbh %[src_lo], %[src0], %[zero] \n\t" - "dsll %[src0], %[src0], %[eight] \n\t" - "punpckhbh %[src_hi], %[src0], %[zero] \n\t" - "paddh %[src0], %[src_lo], %[src_hi] \n\t" - "punpcklbh %[src_lo], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_lo] \n\t" - "dsll %[src1], %[src1], %[eight] \n\t" - "punpckhbh %[src_hi], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_hi] \n\t" - "paddh %[src0], %[src0], %[ftmp12] \n\t" - "psrlh %[src0], %[src0], %[one] \n\t" - "dsll %[src_lo], %[src0], %[sixteen] \n\t" - "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t" - "pinsrh_3 %[src_hi], %[src0], %[value] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask_u] 
\n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t" - - "punpcklwd %[src0], %[dest3_u], %[src_lo] \n\t" - "punpckhwd %[src1], %[dest3_u], %[src_lo] \n\t" - "psubw %[dest3_u], %[src0], %[src1] \n\t" - "psraw %[dest3_u], %[dest3_u], %[eight] \n\t" - "punpcklwd %[src0], %[dest3_v], %[src_hi] \n\t" - "punpckhwd %[src1], %[dest3_v], %[src_hi] \n\t" - "psubw %[dest3_v], %[src1], %[src0] \n\t" - "psraw %[dest3_v], %[dest3_v], %[eight] \n\t" - - "packsswh %[src0], %[dest0_u], %[dest1_u] \n\t" - "packsswh %[src1], %[dest2_u], %[dest3_u] \n\t" - "packushb %[dest0_u], %[src0], %[src1] \n\t" - "gssdlc1 %[dest0_u], 0x07(%[dst_u]) \n\t" - "gssdrc1 %[dest0_u], 0x00(%[dst_u]) \n\t" - - "packsswh %[src0], %[dest0_v], %[dest1_v] \n\t" - "packsswh %[src1], %[dest2_v], %[dest3_v] \n\t" - "packushb %[dest0_v], %[src0], %[src1] \n\t" - "gssdlc1 %[dest0_v], 0x07(%[dst_v]) \n\t" - "gssdrc1 %[dest0_v], 0x00(%[dst_v]) \n\t" - - "daddiu %[src_rgb], %[src_rgb], 0x30 \n\t" - "daddiu %[dst_u], %[dst_u], 0x08 \n\t" - "daddiu %[dst_v], %[dst_v], 0x08 \n\t" - "daddi %[width], %[width], -0x10 \n\t" - "bgtz %[width], 1b \n\t" - : [src_rgb1] "=&r"(src_rgb1), [src0] "=&f"(ftmp[0]), - [src1] "=&f"(ftmp[1]), [src_lo] "=&f"(ftmp[2]), [src_hi] "=&f"(ftmp[3]), - [dest0_u] "=&f"(ftmp[4]), [dest0_v] "=&f"(ftmp[5]), - [dest1_u] "=&f"(ftmp[6]), [dest1_v] "=&f"(ftmp[7]), - [dest2_u] "=&f"(ftmp[8]), [dest2_v] "=&f"(ftmp[9]), - [dest3_u] "=&f"(ftmp[10]), [dest3_v] "=&f"(ftmp[11]), - [ftmp12] "=&f"(ftmp[12]), [tmp0] "=&r"(tmp[0]) - : [src_rgb] "r"(src_rgb), [src_stride_rgb] "r"(src_stride_rgb), - [dst_u] "r"(dst_u), [dst_v] "r"(dst_v), [width] "r"(width), - [mask_u] "f"(mask_u), [mask_v] "f"(mask_v), [value] "f"(value), - [zero] "f"(0x00), [eight] "f"(0x08), [one] "f"(0x01), - [sixteen] "f"(0x10) - : "memory"); -} - -void RAWToYRow_MMI(const uint8_t* src_argb, uint8_t* dst_y, int width) { - uint64_t src, src_hi, src_lo; - uint64_t dest0, dest1, dest2, dest3; - const uint64_t value = 0x1080; - const uint64_t mask = 0x0001001900810042; - - __asm__ volatile( - "1: \n\t" - "gsldlc1 %[src], 0x07(%[src_argb]) \n\t" - "gsldrc1 %[src], 0x00(%[src_argb]) \n\t" - "punpcklbh %[src_lo], %[src], %[zero] \n\t" - "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" - "dsll %[src], %[src], %[eight] \n\t" - "punpckhbh %[src_hi], %[src], %[zero] \n\t" - "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t" - "punpcklwd %[src], %[src_lo], %[src_hi] \n\t" - "punpckhwd %[dest0], %[src_lo], %[src_hi] \n\t" - "paddw %[dest0], %[dest0], %[src] \n\t" - "psrlw %[dest0], %[dest0], %[eight] \n\t" - - "gsldlc1 %[src], 0x0d(%[src_argb]) \n\t" - "gsldrc1 %[src], 0x06(%[src_argb]) \n\t" - "punpcklbh %[src_lo], %[src], %[zero] \n\t" - "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" - "dsll %[src], %[src], %[eight] \n\t" - "punpckhbh %[src_hi], %[src], %[zero] \n\t" - "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t" - "punpcklwd %[src], %[src_lo], %[src_hi] \n\t" - "punpckhwd %[dest1], %[src_lo], %[src_hi] \n\t" - "paddw %[dest1], %[dest1], %[src] \n\t" - "psrlw %[dest1], %[dest1], %[eight] \n\t" - - "gsldlc1 %[src], 0x13(%[src_argb]) \n\t" - "gsldrc1 %[src], 0x0c(%[src_argb]) \n\t" - "punpcklbh %[src_lo], %[src], %[zero] \n\t" - "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" - "dsll %[src], %[src], %[eight] \n\t" - "punpckhbh %[src_hi], %[src], %[zero] 
\n\t" - "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t" - "punpcklwd %[src], %[src_lo], %[src_hi] \n\t" - "punpckhwd %[dest2], %[src_lo], %[src_hi] \n\t" - "paddw %[dest2], %[dest2], %[src] \n\t" - "psrlw %[dest2], %[dest2], %[eight] \n\t" - - "gsldlc1 %[src], 0x19(%[src_argb]) \n\t" - "gsldrc1 %[src], 0x12(%[src_argb]) \n\t" - "punpcklbh %[src_lo], %[src], %[zero] \n\t" - "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" - "dsll %[src], %[src], %[eight] \n\t" - "punpckhbh %[src_hi], %[src], %[zero] \n\t" - "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t" - "punpcklwd %[src], %[src_lo], %[src_hi] \n\t" - "punpckhwd %[dest3], %[src_lo], %[src_hi] \n\t" - "paddw %[dest3], %[dest3], %[src] \n\t" - "psrlw %[dest3], %[dest3], %[eight] \n\t" - - "packsswh %[src_lo], %[dest0], %[dest1] \n\t" - "packsswh %[src_hi], %[dest2], %[dest3] \n\t" - "packushb %[dest0], %[src_lo], %[src_hi] \n\t" - "gssdlc1 %[dest0], 0x07(%[dst_y]) \n\t" - "gssdrc1 %[dest0], 0x00(%[dst_y]) \n\t" - - "daddiu %[src_argb], %[src_argb], 0x18 \n\t" - "daddiu %[dst_y], %[dst_y], 0x08 \n\t" - "daddi %[width], %[width], -0x08 \n\t" - "bnez %[width], 1b \n\t" - : [src] "=&f"(src), [src_hi] "=&f"(src_hi), [src_lo] "=&f"(src_lo), - [dest0] "=&f"(dest0), [dest1] "=&f"(dest1), [dest2] "=&f"(dest2), - [dest3] "=&f"(dest3) - : [src_argb] "r"(src_argb), [dst_y] "r"(dst_y), [width] "r"(width), - [mask] "f"(mask), [value] "f"(value), [eight] "f"(0x08), - [zero] "f"(0x00) - : "memory"); -} - -void RAWToUVRow_MMI(const uint8_t* src_rgb, - int src_stride_rgb, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - uint64_t src_rgb1; - uint64_t ftmp[13]; - uint64_t tmp[1]; - const uint64_t value = 0x4040; - const uint64_t mask_u = 0x0002003800250013; - const uint64_t mask_v = 0x0009002f00380002; - - __asm__ volatile( - "dli %[tmp0], 0x0001000100010001 \n\t" - "dmtc1 %[tmp0], %[ftmp12] \n\t" - "1: \n\t" - "daddu %[src_rgb1], %[src_rgb], %[src_stride_rgb] \n\t" - "gsldrc1 %[src0], 0x00(%[src_rgb]) \n\t" - "gsldlc1 %[src0], 0x07(%[src_rgb]) \n\t" - "gsldrc1 %[src1], 0x00(%[src_rgb1]) \n\t" - "gsldlc1 %[src1], 0x07(%[src_rgb1]) \n\t" - "punpcklbh %[src_lo], %[src0], %[zero] \n\t" - "dsll %[src0], %[src0], %[eight] \n\t" - "punpckhbh %[src_hi], %[src0], %[zero] \n\t" - "paddh %[src0], %[src_lo], %[src_hi] \n\t" - "punpcklbh %[src_lo], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_lo] \n\t" - "dsll %[src1], %[src1], %[eight] \n\t" - "punpckhbh %[src_hi], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_hi] \n\t" - "paddh %[src0], %[src0], %[ftmp12] \n\t" - "psrlh %[src0], %[src0], %[one] \n\t" - "pinsrh_3 %[dest0_u], %[src0], %[value] \n\t" - "dsll %[dest0_v], %[src0], %[sixteen] \n\t" - "pinsrh_0 %[dest0_v], %[dest0_v], %[value] \n\t" - "pmaddhw %[dest0_u], %[dest0_u], %[mask_u] \n\t" - "pmaddhw %[dest0_v], %[dest0_v], %[mask_v] \n\t" - - "gsldrc1 %[src0], 0x06(%[src_rgb]) \n\t" - "gsldlc1 %[src0], 0x0d(%[src_rgb]) \n\t" - "gsldrc1 %[src1], 0x06(%[src_rgb1]) \n\t" - "gsldlc1 %[src1], 0x0d(%[src_rgb1]) \n\t" - "punpcklbh %[src_lo], %[src0], %[zero] \n\t" - "dsll %[src0], %[src0], %[eight] \n\t" - "punpckhbh %[src_hi], %[src0], %[zero] \n\t" - "paddh %[src0], %[src_lo], %[src_hi] \n\t" - "punpcklbh %[src_lo], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_lo] \n\t" - "dsll %[src1], %[src1], %[eight] \n\t" - "punpckhbh %[src_hi], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_hi] \n\t" - 
"paddh %[src0], %[src0], %[ftmp12] \n\t" - "psrlh %[src0], %[src0], %[one] \n\t" - "pinsrh_3 %[src_lo], %[src0], %[value] \n\t" - "dsll %[src_hi], %[src0], %[sixteen] \n\t" - "pinsrh_0 %[src_hi], %[src_hi], %[value] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t" - - "punpcklwd %[src0], %[dest0_u], %[src_lo] \n\t" - "punpckhwd %[src1], %[dest0_u], %[src_lo] \n\t" - "psubw %[dest0_u], %[src1], %[src0] \n\t" - "psraw %[dest0_u], %[dest0_u], %[eight] \n\t" - "punpcklwd %[src0], %[dest0_v], %[src_hi] \n\t" - "punpckhwd %[src1], %[dest0_v], %[src_hi] \n\t" - "psubw %[dest0_v], %[src0], %[src1] \n\t" - "psraw %[dest0_v], %[dest0_v], %[eight] \n\t" - - "gsldrc1 %[src0], 0x0c(%[src_rgb]) \n\t" - "gsldlc1 %[src0], 0x13(%[src_rgb]) \n\t" - "gsldrc1 %[src1], 0x0c(%[src_rgb1]) \n\t" - "gsldlc1 %[src1], 0x13(%[src_rgb1]) \n\t" - "punpcklbh %[src_lo], %[src0], %[zero] \n\t" - "dsll %[src0], %[src0], %[eight] \n\t" - "punpckhbh %[src_hi], %[src0], %[zero] \n\t" - "paddh %[src0], %[src_lo], %[src_hi] \n\t" - "punpcklbh %[src_lo], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_lo] \n\t" - "dsll %[src1], %[src1], %[eight] \n\t" - "punpckhbh %[src_hi], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_hi] \n\t" - "paddh %[src0], %[src0], %[ftmp12] \n\t" - "psrlh %[src0], %[src0], %[one] \n\t" - "pinsrh_3 %[dest1_u], %[src0], %[value] \n\t" - "dsll %[dest1_v], %[src0], %[sixteen] \n\t" - "pinsrh_0 %[dest1_v], %[dest1_v], %[value] \n\t" - "pmaddhw %[dest1_u], %[dest1_u], %[mask_u] \n\t" - "pmaddhw %[dest1_v], %[dest1_v], %[mask_v] \n\t" - - "gsldrc1 %[src0], 0x12(%[src_rgb]) \n\t" - "gsldlc1 %[src0], 0x19(%[src_rgb]) \n\t" - "gsldrc1 %[src1], 0x12(%[src_rgb1]) \n\t" - "gsldlc1 %[src1], 0x19(%[src_rgb1]) \n\t" - "punpcklbh %[src_lo], %[src0], %[zero] \n\t" - "dsll %[src0], %[src0], %[eight] \n\t" - "punpckhbh %[src_hi], %[src0], %[zero] \n\t" - "paddh %[src0], %[src_lo], %[src_hi] \n\t" - "punpcklbh %[src_lo], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_lo] \n\t" - "dsll %[src1], %[src1], %[eight] \n\t" - "punpckhbh %[src_hi], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_hi] \n\t" - "paddh %[src0], %[src0], %[ftmp12] \n\t" - "psrlh %[src0], %[src0], %[one] \n\t" - "pinsrh_3 %[src_lo], %[src0], %[value] \n\t" - "dsll %[src_hi], %[src0], %[sixteen] \n\t" - "pinsrh_0 %[src_hi], %[src_hi], %[value] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t" - - "punpcklwd %[src0], %[dest1_u], %[src_lo] \n\t" - "punpckhwd %[src1], %[dest1_u], %[src_lo] \n\t" - "psubw %[dest1_u], %[src1], %[src0] \n\t" - "psraw %[dest1_u], %[dest1_u], %[eight] \n\t" - "punpcklwd %[src0], %[dest1_v], %[src_hi] \n\t" - "punpckhwd %[src1], %[dest1_v], %[src_hi] \n\t" - "psubw %[dest1_v], %[src0], %[src1] \n\t" - "psraw %[dest1_v], %[dest1_v], %[eight] \n\t" - - "gsldrc1 %[src0], 0x18(%[src_rgb]) \n\t" - "gsldlc1 %[src0], 0x1f(%[src_rgb]) \n\t" - "gsldrc1 %[src1], 0x18(%[src_rgb1]) \n\t" - "gsldlc1 %[src1], 0x1f(%[src_rgb1]) \n\t" - "punpcklbh %[src_lo], %[src0], %[zero] \n\t" - "dsll %[src0], %[src0], %[eight] \n\t" - "punpckhbh %[src_hi], %[src0], %[zero] \n\t" - "paddh %[src0], %[src_lo], %[src_hi] \n\t" - "punpcklbh %[src_lo], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_lo] \n\t" - "dsll %[src1], %[src1], %[eight] \n\t" - "punpckhbh %[src_hi], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_hi] \n\t" - "paddh %[src0], %[src0], %[ftmp12] \n\t" - "psrlh %[src0], %[src0], %[one] 
\n\t" - "pinsrh_3 %[dest2_u], %[src0], %[value] \n\t" - "dsll %[dest2_v], %[src0], %[sixteen] \n\t" - "pinsrh_0 %[dest2_v], %[dest2_v], %[value] \n\t" - "pmaddhw %[dest2_u], %[dest2_u], %[mask_u] \n\t" - "pmaddhw %[dest2_v], %[dest2_v], %[mask_v] \n\t" - - "gsldrc1 %[src0], 0x1e(%[src_rgb]) \n\t" - "gsldlc1 %[src0], 0x25(%[src_rgb]) \n\t" - "gsldrc1 %[src1], 0x1e(%[src_rgb1]) \n\t" - "gsldlc1 %[src1], 0x25(%[src_rgb1]) \n\t" - "punpcklbh %[src_lo], %[src0], %[zero] \n\t" - "dsll %[src0], %[src0], %[eight] \n\t" - "punpckhbh %[src_hi], %[src0], %[zero] \n\t" - "paddh %[src0], %[src_lo], %[src_hi] \n\t" - "punpcklbh %[src_lo], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_lo] \n\t" - "dsll %[src1], %[src1], %[eight] \n\t" - "punpckhbh %[src_hi], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_hi] \n\t" - "paddh %[src0], %[src0], %[ftmp12] \n\t" - "psrlh %[src0], %[src0], %[one] \n\t" - "pinsrh_3 %[src_lo], %[src0], %[value] \n\t" - "dsll %[src_hi], %[src0], %[sixteen] \n\t" - "pinsrh_0 %[src_hi], %[src_hi], %[value] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t" - - "punpcklwd %[src0], %[dest2_u], %[src_lo] \n\t" - "punpckhwd %[src1], %[dest2_u], %[src_lo] \n\t" - "psubw %[dest2_u], %[src1], %[src0] \n\t" - "psraw %[dest2_u], %[dest2_u], %[eight] \n\t" - "punpcklwd %[src0], %[dest2_v], %[src_hi] \n\t" - "punpckhwd %[src1], %[dest2_v], %[src_hi] \n\t" - "psubw %[dest2_v], %[src0], %[src1] \n\t" - "psraw %[dest2_v], %[dest2_v], %[eight] \n\t" - - "gsldrc1 %[src0], 0x24(%[src_rgb]) \n\t" - "gsldlc1 %[src0], 0x2b(%[src_rgb]) \n\t" - "gsldrc1 %[src1], 0x24(%[src_rgb1]) \n\t" - "gsldlc1 %[src1], 0x2b(%[src_rgb1]) \n\t" - "punpcklbh %[src_lo], %[src0], %[zero] \n\t" - "dsll %[src0], %[src0], %[eight] \n\t" - "punpckhbh %[src_hi], %[src0], %[zero] \n\t" - "paddh %[src0], %[src_lo], %[src_hi] \n\t" - "punpcklbh %[src_lo], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_lo] \n\t" - "dsll %[src1], %[src1], %[eight] \n\t" - "punpckhbh %[src_hi], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_hi] \n\t" - "paddh %[src0], %[src0], %[ftmp12] \n\t" - "psrlh %[src0], %[src0], %[one] \n\t" - "pinsrh_3 %[dest3_u], %[src0], %[value] \n\t" - "dsll %[dest3_v], %[src0], %[sixteen] \n\t" - "pinsrh_0 %[dest3_v], %[dest3_v], %[value] \n\t" - "pmaddhw %[dest3_u], %[dest3_u], %[mask_u] \n\t" - "pmaddhw %[dest3_v], %[dest3_v], %[mask_v] \n\t" - - "gsldrc1 %[src0], 0x2a(%[src_rgb]) \n\t" - "gsldlc1 %[src0], 0x31(%[src_rgb]) \n\t" - "gsldrc1 %[src1], 0x2a(%[src_rgb1]) \n\t" - "gsldlc1 %[src1], 0x31(%[src_rgb1]) \n\t" - "punpcklbh %[src_lo], %[src0], %[zero] \n\t" - "dsll %[src0], %[src0], %[eight] \n\t" - "punpckhbh %[src_hi], %[src0], %[zero] \n\t" - "paddh %[src0], %[src_lo], %[src_hi] \n\t" - "punpcklbh %[src_lo], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_lo] \n\t" - "dsll %[src1], %[src1], %[eight] \n\t" - "punpckhbh %[src_hi], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_hi] \n\t" - "paddh %[src0], %[src0], %[ftmp12] \n\t" - "psrlh %[src0], %[src0], %[one] \n\t" - "pinsrh_3 %[src_lo], %[src0], %[value] \n\t" - "dsll %[src_hi], %[src0], %[sixteen] \n\t" - "pinsrh_0 %[src_hi], %[src_hi], %[value] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t" - - "punpcklwd %[src0], %[dest3_u], %[src_lo] \n\t" - "punpckhwd %[src1], %[dest3_u], %[src_lo] \n\t" - "psubw %[dest3_u], %[src1], %[src0] \n\t" - "psraw %[dest3_u], %[dest3_u], %[eight] \n\t" - "punpcklwd 
%[src0], %[dest3_v], %[src_hi] \n\t" - "punpckhwd %[src1], %[dest3_v], %[src_hi] \n\t" - "psubw %[dest3_v], %[src0], %[src1] \n\t" - "psraw %[dest3_v], %[dest3_v], %[eight] \n\t" - - "packsswh %[src0], %[dest0_u], %[dest1_u] \n\t" - "packsswh %[src1], %[dest2_u], %[dest3_u] \n\t" - "packushb %[dest0_u], %[src0], %[src1] \n\t" - "gssdlc1 %[dest0_u], 0x07(%[dst_u]) \n\t" - "gssdrc1 %[dest0_u], 0x00(%[dst_u]) \n\t" - - "packsswh %[src0], %[dest0_v], %[dest1_v] \n\t" - "packsswh %[src1], %[dest2_v], %[dest3_v] \n\t" - "packushb %[dest0_v], %[src0], %[src1] \n\t" - "gssdlc1 %[dest0_v], 0x07(%[dst_v]) \n\t" - "gssdrc1 %[dest0_v], 0x00(%[dst_v]) \n\t" - - "daddiu %[src_rgb], %[src_rgb], 0x30 \n\t" - "daddiu %[dst_u], %[dst_u], 0x08 \n\t" - "daddiu %[dst_v], %[dst_v], 0x08 \n\t" - "daddi %[width], %[width], -0x10 \n\t" - "bgtz %[width], 1b \n\t" - : [src_rgb1] "=&r"(src_rgb1), [src0] "=&f"(ftmp[0]), - [src1] "=&f"(ftmp[1]), [src_lo] "=&f"(ftmp[2]), [src_hi] "=&f"(ftmp[3]), - [dest0_u] "=&f"(ftmp[4]), [dest0_v] "=&f"(ftmp[5]), - [dest1_u] "=&f"(ftmp[6]), [dest1_v] "=&f"(ftmp[7]), - [dest2_u] "=&f"(ftmp[8]), [dest2_v] "=&f"(ftmp[9]), - [dest3_u] "=&f"(ftmp[10]), [dest3_v] "=&f"(ftmp[11]), - [ftmp12] "=&f"(ftmp[12]), [tmp0] "=&r"(tmp[0]) - : [src_rgb] "r"(src_rgb), [src_stride_rgb] "r"(src_stride_rgb), - [dst_u] "r"(dst_u), [dst_v] "r"(dst_v), [width] "r"(width), - [mask_u] "f"(mask_u), [mask_v] "f"(mask_v), [value] "f"(value), - [zero] "f"(0x00), [eight] "f"(0x08), [one] "f"(0x01), - [sixteen] "f"(0x10) - : "memory"); -} - -void ARGBToYJRow_MMI(const uint8_t* src_argb, uint8_t* dst_y, int width) { - uint64_t src, src_hi, src_lo; - uint64_t dest, dest0, dest1, dest2, dest3; - uint64_t tmp0, tmp1; - const uint64_t shift = 0x08; - const uint64_t value = 0x80; - const uint64_t mask0 = 0x0; - const uint64_t mask1 = 0x0001004D0096001DULL; - - __asm__ volatile( - "1: \n\t" - "gsldlc1 %[src], 0x07(%[src_ptr]) \n\t" - "gsldrc1 %[src], 0x00(%[src_ptr]) \n\t" - "punpcklbh %[src_lo], %[src], %[mask0] \n\t" - "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask1] \n\t" - "punpckhbh %[src_hi], %[src], %[mask0] \n\t" - "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask1] \n\t" - "punpcklwd %[tmp0], %[src_lo], %[src_hi] \n\t" - "punpckhwd %[tmp1], %[src_lo], %[src_hi] \n\t" - "paddw %[dest0], %[tmp0], %[tmp1] \n\t" - "psrlw %[dest0], %[dest0], %[shift] \n\t" - - "gsldlc1 %[src], 0x0f(%[src_ptr]) \n\t" - "gsldrc1 %[src], 0x08(%[src_ptr]) \n\t" - "punpcklbh %[src_lo], %[src], %[mask0] \n\t" - "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask1] \n\t" - "punpckhbh %[src_hi], %[src], %[mask0] \n\t" - "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask1] \n\t" - "punpcklwd %[tmp0], %[src_lo], %[src_hi] \n\t" - "punpckhwd %[tmp1], %[src_lo], %[src_hi] \n\t" - "paddw %[dest1], %[tmp0], %[tmp1] \n\t" - "psrlw %[dest1], %[dest1], %[shift] \n\t" - - "gsldlc1 %[src], 0x17(%[src_ptr]) \n\t" - "gsldrc1 %[src], 0x10(%[src_ptr]) \n\t" - "punpcklbh %[src_lo], %[src], %[mask0] \n\t" - "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask1] \n\t" - "punpckhbh %[src_hi], %[src], %[mask0] \n\t" - "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask1] \n\t" - "punpcklwd %[tmp0], %[src_lo], %[src_hi] \n\t" - "punpckhwd %[tmp1], %[src_lo], %[src_hi] \n\t" - "paddw %[dest2], %[tmp0], %[tmp1] \n\t" - "psrlw %[dest2], %[dest2], %[shift] 
\n\t" - - "gsldlc1 %[src], 0x1f(%[src_ptr]) \n\t" - "gsldrc1 %[src], 0x18(%[src_ptr]) \n\t" - "punpcklbh %[src_lo], %[src], %[mask0] \n\t" - "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask1] \n\t" - "punpckhbh %[src_hi], %[src], %[mask0] \n\t" - "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask1] \n\t" - "punpcklwd %[tmp0], %[src_lo], %[src_hi] \n\t" - "punpckhwd %[tmp1], %[src_lo], %[src_hi] \n\t" - "paddw %[dest3], %[tmp0], %[tmp1] \n\t" - "psrlw %[dest3], %[dest3], %[shift] \n\t" - - "packsswh %[tmp0], %[dest0], %[dest1] \n\t" - "packsswh %[tmp1], %[dest2], %[dest3] \n\t" - "packushb %[dest], %[tmp0], %[tmp1] \n\t" - "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t" - "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t" - - "daddiu %[src_ptr], %[src_ptr], 0x20 \n\t" - "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t" - "daddi %[width], %[width], -0x08 \n\t" - "bnez %[width], 1b \n\t" - : [src] "=&f"(src), [dest] "=&f"(dest), [src_hi] "=&f"(src_hi), - [src_lo] "=&f"(src_lo), [dest0] "=&f"(dest0), [dest1] "=&f"(dest1), - [dest2] "=&f"(dest2), [dest3] "=&f"(dest3), [tmp0] "=&f"(tmp0), - [tmp1] "=&f"(tmp1) - : [src_ptr] "r"(src_argb), [dst_ptr] "r"(dst_y), [mask0] "f"(mask0), - [mask1] "f"(mask1), [shift] "f"(shift), [value] "f"(value), - [width] "r"(width) - : "memory"); -} - -void ARGBToUVJRow_MMI(const uint8_t* src_rgb, - int src_stride_rgb, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - uint64_t src_rgb1; - uint64_t ftmp[12]; - const uint64_t value = 0x4040; - const uint64_t mask_u = 0x0015002a003f0002; - const uint64_t mask_v = 0x0002003f0035000a; - - __asm__ volatile( - "1: \n\t" - "daddu %[src_rgb1], %[src_rgb], %[src_stride_rgb] \n\t" - "gsldrc1 %[src0], 0x00(%[src_rgb]) \n\t" - "gsldlc1 %[src0], 0x07(%[src_rgb]) \n\t" - "gsldrc1 %[src1], 0x00(%[src_rgb1]) \n\t" - "gsldlc1 %[src1], 0x07(%[src_rgb1]) \n\t" - "punpcklbh %[src_lo], %[src0], %[zero] \n\t" - "punpckhbh %[src_hi], %[src0], %[zero] \n\t" - "punpcklbh %[src0], %[src1], %[zero] \n\t" - "punpckhbh %[src1], %[src1], %[zero] \n\t" - "paddh %[src0], %[src_lo], %[src0] \n\t" - "paddh %[src1], %[src_hi], %[src1] \n\t" - "pavgh %[src0], %[src0], %[src1] \n\t" - "dsll %[dest0_u], %[src0], %[sixteen] \n\t" - "pinsrh_0 %[dest0_u], %[dest0_u], %[value] \n\t" - "pinsrh_3 %[dest0_v], %[src0], %[value] \n\t" - "pmaddhw %[dest0_u], %[dest0_u], %[mask_u] \n\t" - "pmaddhw %[dest0_v], %[dest0_v], %[mask_v] \n\t" - - "gsldrc1 %[src0], 0x08(%[src_rgb]) \n\t" - "gsldlc1 %[src0], 0x0f(%[src_rgb]) \n\t" - "gsldrc1 %[src1], 0x08(%[src_rgb1]) \n\t" - "gsldlc1 %[src1], 0x0f(%[src_rgb1]) \n\t" - "punpcklbh %[src_lo], %[src0], %[zero] \n\t" - "punpckhbh %[src_hi], %[src0], %[zero] \n\t" - "punpcklbh %[src0], %[src1], %[zero] \n\t" - "punpckhbh %[src1], %[src1], %[zero] \n\t" - "paddh %[src0], %[src_lo], %[src0] \n\t" - "paddh %[src1], %[src_hi], %[src1] \n\t" - "pavgh %[src0], %[src0], %[src1] \n\t" - "dsll %[src_lo], %[src0], %[sixteen] \n\t" - "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t" - "pinsrh_3 %[src_hi], %[src0], %[value] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t" - - "punpcklwd %[src0], %[dest0_u], %[src_lo] \n\t" - "punpckhwd %[src1], %[dest0_u], %[src_lo] \n\t" - "psubw %[dest0_u], %[src0], %[src1] \n\t" - "psraw %[dest0_u], %[dest0_u], %[eight] \n\t" - "punpcklwd %[src0], %[dest0_v], %[src_hi] \n\t" - "punpckhwd %[src1], %[dest0_v], %[src_hi] \n\t" - "psubw %[dest0_v], %[src1], %[src0] \n\t" - "psraw %[dest0_v], %[dest0_v], 
%[eight] \n\t" - - "gsldrc1 %[src0], 0x10(%[src_rgb]) \n\t" - "gsldlc1 %[src0], 0x17(%[src_rgb]) \n\t" - "gsldrc1 %[src1], 0x10(%[src_rgb1]) \n\t" - "gsldlc1 %[src1], 0x17(%[src_rgb1]) \n\t" - "punpcklbh %[src_lo], %[src0], %[zero] \n\t" - "punpckhbh %[src_hi], %[src0], %[zero] \n\t" - "punpcklbh %[src0], %[src1], %[zero] \n\t" - "punpckhbh %[src1], %[src1], %[zero] \n\t" - "paddh %[src0], %[src_lo], %[src0] \n\t" - "paddh %[src1], %[src_hi], %[src1] \n\t" - "pavgh %[src0], %[src0], %[src1] \n\t" - "dsll %[dest1_u], %[src0], %[sixteen] \n\t" - "pinsrh_0 %[dest1_u], %[dest1_u], %[value] \n\t" - "pinsrh_3 %[dest1_v], %[src0], %[value] \n\t" - "pmaddhw %[dest1_u], %[dest1_u], %[mask_u] \n\t" - "pmaddhw %[dest1_v], %[dest1_v], %[mask_v] \n\t" - - "gsldrc1 %[src0], 0x18(%[src_rgb]) \n\t" - "gsldlc1 %[src0], 0x1f(%[src_rgb]) \n\t" - "gsldrc1 %[src1], 0x18(%[src_rgb1]) \n\t" - "gsldlc1 %[src1], 0x1f(%[src_rgb1]) \n\t" - "punpcklbh %[src_lo], %[src0], %[zero] \n\t" - "punpckhbh %[src_hi], %[src0], %[zero] \n\t" - "punpcklbh %[src0], %[src1], %[zero] \n\t" - "punpckhbh %[src1], %[src1], %[zero] \n\t" - "paddh %[src0], %[src_lo], %[src0] \n\t" - "paddh %[src1], %[src_hi], %[src1] \n\t" - "pavgh %[src0], %[src0], %[src1] \n\t" - "dsll %[src_lo], %[src0], %[sixteen] \n\t" - "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t" - "pinsrh_3 %[src_hi], %[src0], %[value] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t" - - "punpcklwd %[src0], %[dest1_u], %[src_lo] \n\t" - "punpckhwd %[src1], %[dest1_u], %[src_lo] \n\t" - "psubw %[dest1_u], %[src0], %[src1] \n\t" - "psraw %[dest1_u], %[dest1_u], %[eight] \n\t" - "punpcklwd %[src0], %[dest1_v], %[src_hi] \n\t" - "punpckhwd %[src1], %[dest1_v], %[src_hi] \n\t" - "psubw %[dest1_v], %[src1], %[src0] \n\t" - "psraw %[dest1_v], %[dest1_v], %[eight] \n\t" - - "gsldrc1 %[src0], 0x20(%[src_rgb]) \n\t" - "gsldlc1 %[src0], 0x27(%[src_rgb]) \n\t" - "gsldrc1 %[src1], 0x20(%[src_rgb1]) \n\t" - "gsldlc1 %[src1], 0x27(%[src_rgb1]) \n\t" - "punpcklbh %[src_lo], %[src0], %[zero] \n\t" - "punpckhbh %[src_hi], %[src0], %[zero] \n\t" - "punpcklbh %[src0], %[src1], %[zero] \n\t" - "punpckhbh %[src1], %[src1], %[zero] \n\t" - "paddh %[src0], %[src_lo], %[src0] \n\t" - "paddh %[src1], %[src_hi], %[src1] \n\t" - "pavgh %[src0], %[src0], %[src1] \n\t" - "dsll %[dest2_u], %[src0], %[sixteen] \n\t" - "pinsrh_0 %[dest2_u], %[dest2_u], %[value] \n\t" - "pinsrh_3 %[dest2_v], %[src0], %[value] \n\t" - "pmaddhw %[dest2_u], %[dest2_u], %[mask_u] \n\t" - "pmaddhw %[dest2_v], %[dest2_v], %[mask_v] \n\t" - - "gsldrc1 %[src0], 0x28(%[src_rgb]) \n\t" - "gsldlc1 %[src0], 0x2f(%[src_rgb]) \n\t" - "gsldrc1 %[src1], 0x28(%[src_rgb1]) \n\t" - "gsldlc1 %[src1], 0x2f(%[src_rgb1]) \n\t" - "punpcklbh %[src_lo], %[src0], %[zero] \n\t" - "punpckhbh %[src_hi], %[src0], %[zero] \n\t" - "punpcklbh %[src0], %[src1], %[zero] \n\t" - "punpckhbh %[src1], %[src1], %[zero] \n\t" - "paddh %[src0], %[src_lo], %[src0] \n\t" - "paddh %[src1], %[src_hi], %[src1] \n\t" - "pavgh %[src0], %[src0], %[src1] \n\t" - "dsll %[src_lo], %[src0], %[sixteen] \n\t" - "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t" - "pinsrh_3 %[src_hi], %[src0], %[value] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t" - - "punpcklwd %[src0], %[dest2_u], %[src_lo] \n\t" - "punpckhwd %[src1], %[dest2_u], %[src_lo] \n\t" - "psubw %[dest2_u], %[src0], %[src1] \n\t" - "psraw %[dest2_u], %[dest2_u], %[eight] \n\t" - "punpcklwd %[src0], %[dest2_v], 
%[src_hi] \n\t" - "punpckhwd %[src1], %[dest2_v], %[src_hi] \n\t" - "psubw %[dest2_v], %[src1], %[src0] \n\t" - "psraw %[dest2_v], %[dest2_v], %[eight] \n\t" - - "gsldrc1 %[src0], 0x30(%[src_rgb]) \n\t" - "gsldlc1 %[src0], 0x37(%[src_rgb]) \n\t" - "gsldrc1 %[src1], 0x30(%[src_rgb1]) \n\t" - "gsldlc1 %[src1], 0x37(%[src_rgb1]) \n\t" - "punpcklbh %[src_lo], %[src0], %[zero] \n\t" - "punpckhbh %[src_hi], %[src0], %[zero] \n\t" - "punpcklbh %[src0], %[src1], %[zero] \n\t" - "punpckhbh %[src1], %[src1], %[zero] \n\t" - "paddh %[src0], %[src_lo], %[src0] \n\t" - "paddh %[src1], %[src_hi], %[src1] \n\t" - "pavgh %[src0], %[src0], %[src1] \n\t" - "dsll %[dest3_u], %[src0], %[sixteen] \n\t" - "pinsrh_0 %[dest3_u], %[dest3_u], %[value] \n\t" - "pinsrh_3 %[dest3_v], %[src0], %[value] \n\t" - "pmaddhw %[dest3_u], %[dest3_u], %[mask_u] \n\t" - "pmaddhw %[dest3_v], %[dest3_v], %[mask_v] \n\t" - - "gsldrc1 %[src0], 0x38(%[src_rgb]) \n\t" - "gsldlc1 %[src0], 0x3f(%[src_rgb]) \n\t" - "gsldrc1 %[src1], 0x38(%[src_rgb1]) \n\t" - "gsldlc1 %[src1], 0x3f(%[src_rgb1]) \n\t" - "punpcklbh %[src_lo], %[src0], %[zero] \n\t" - "punpckhbh %[src_hi], %[src0], %[zero] \n\t" - "punpcklbh %[src0], %[src1], %[zero] \n\t" - "punpckhbh %[src1], %[src1], %[zero] \n\t" - "paddh %[src0], %[src_lo], %[src0] \n\t" - "paddh %[src1], %[src_hi], %[src1] \n\t" - "pavgh %[src0], %[src0], %[src1] \n\t" - "dsll %[src_lo], %[src0], %[sixteen] \n\t" - "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t" - "pinsrh_3 %[src_hi], %[src0], %[value] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t" - - "punpcklwd %[src0], %[dest3_u], %[src_lo] \n\t" - "punpckhwd %[src1], %[dest3_u], %[src_lo] \n\t" - "psubw %[dest3_u], %[src0], %[src1] \n\t" - "psraw %[dest3_u], %[dest3_u], %[eight] \n\t" - "punpcklwd %[src0], %[dest3_v], %[src_hi] \n\t" - "punpckhwd %[src1], %[dest3_v], %[src_hi] \n\t" - "psubw %[dest3_v], %[src1], %[src0] \n\t" - "psraw %[dest3_v], %[dest3_v], %[eight] \n\t" - - "packsswh %[src0], %[dest0_u], %[dest1_u] \n\t" - "packsswh %[src1], %[dest2_u], %[dest3_u] \n\t" - "packushb %[dest0_u], %[src0], %[src1] \n\t" - "gssdlc1 %[dest0_u], 0x07(%[dst_u]) \n\t" - "gssdrc1 %[dest0_u], 0x00(%[dst_u]) \n\t" - - "packsswh %[src0], %[dest0_v], %[dest1_v] \n\t" - "packsswh %[src1], %[dest2_v], %[dest3_v] \n\t" - "packushb %[dest0_v], %[src0], %[src1] \n\t" - "gssdlc1 %[dest0_v], 0x07(%[dst_v]) \n\t" - "gssdrc1 %[dest0_v], 0x00(%[dst_v]) \n\t" - - "daddiu %[src_rgb], %[src_rgb], 0x40 \n\t" - "daddiu %[dst_u], %[dst_u], 0x08 \n\t" - "daddiu %[dst_v], %[dst_v], 0x08 \n\t" - "daddi %[width], %[width], -0x10 \n\t" - "bgtz %[width], 1b \n\t" - : [src_rgb1] "=&r"(src_rgb1), [src0] "=&f"(ftmp[0]), - [src1] "=&f"(ftmp[1]), [src_lo] "=&f"(ftmp[2]), [src_hi] "=&f"(ftmp[3]), - [dest0_u] "=&f"(ftmp[4]), [dest0_v] "=&f"(ftmp[5]), - [dest1_u] "=&f"(ftmp[6]), [dest1_v] "=&f"(ftmp[7]), - [dest2_u] "=&f"(ftmp[8]), [dest2_v] "=&f"(ftmp[9]), - [dest3_u] "=&f"(ftmp[10]), [dest3_v] "=&f"(ftmp[11]) - : [src_rgb] "r"(src_rgb), [src_stride_rgb] "r"(src_stride_rgb), - [dst_u] "r"(dst_u), [dst_v] "r"(dst_v), [width] "r"(width), - [mask_u] "f"(mask_u), [mask_v] "f"(mask_v), [value] "f"(value), - [zero] "f"(0x00), [eight] "f"(0x08), - [sixteen] "f"(0x10) - : "memory"); -} - -void RGB565ToYRow_MMI(const uint8_t* src_rgb565, uint8_t* dst_y, int width) { - uint64_t ftmp[11]; - const uint64_t value = 0x1080108010801080; - const uint64_t mask = 0x0001004200810019; - uint64_t c0 = 0x001f001f001f001f; - uint64_t c1 = 
0x00ff00ff00ff00ff; - uint64_t c2 = 0x0007000700070007; - __asm__ volatile( - "1: \n\t" - "gsldrc1 %[src0], 0x00(%[src_rgb565]) \n\t" - "gsldlc1 %[src0], 0x07(%[src_rgb565]) \n\t" - "psrlh %[src1], %[src0], %[eight] \n\t" - "and %[b], %[src0], %[c0] \n\t" - "and %[src0], %[src0], %[c1] \n\t" - "psrlh %[src0], %[src0], %[five] \n\t" - "and %[g], %[src1], %[c2] \n\t" - "psllh %[g], %[g], %[three] \n\t" - "or %[g], %[src0], %[g] \n\t" - "psrlh %[r], %[src1], %[three] \n\t" - "psllh %[src0], %[b], %[three] \n\t" - "psrlh %[src1], %[b], %[two] \n\t" - "or %[b], %[src0], %[src1] \n\t" - "psllh %[src0], %[g], %[two] \n\t" - "psrlh %[src1], %[g], %[four] \n\t" - "or %[g], %[src0], %[src1] \n\t" - "psllh %[src0], %[r], %[three] \n\t" - "psrlh %[src1], %[r], %[two] \n\t" - "or %[r], %[src0], %[src1] \n\t" - "punpcklhw %[src0], %[b], %[r] \n\t" - "punpcklhw %[src1], %[g], %[value] \n\t" - "punpcklhw %[src_lo], %[src0], %[src1] \n\t" - "punpckhhw %[src_hi], %[src0], %[src1] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t" - "punpcklwd %[src0], %[src_lo], %[src_hi] \n\t" - "punpckhwd %[src1], %[src_lo], %[src_hi] \n\t" - "paddw %[dest0], %[src0], %[src1] \n\t" - "psrlw %[dest0], %[dest0], %[eight] \n\t" - - "punpckhhw %[src0], %[b], %[r] \n\t" - "punpckhhw %[src1], %[g], %[value] \n\t" - "punpcklhw %[src_lo], %[src0], %[src1] \n\t" - "punpckhhw %[src_hi], %[src0], %[src1] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t" - "punpcklwd %[src0], %[src_lo], %[src_hi] \n\t" - "punpckhwd %[src1], %[src_lo], %[src_hi] \n\t" - "paddw %[dest1], %[src0], %[src1] \n\t" - "psrlw %[dest1], %[dest1], %[eight] \n\t" - - "gsldrc1 %[src0], 0x08(%[src_rgb565]) \n\t" - "gsldlc1 %[src0], 0x0f(%[src_rgb565]) \n\t" - "psrlh %[src1], %[src0], %[eight] \n\t" - "and %[b], %[src0], %[c0] \n\t" - "and %[src0], %[src0], %[c1] \n\t" - "psrlh %[src0], %[src0], %[five] \n\t" - "and %[g], %[src1], %[c2] \n\t" - "psllh %[g], %[g], %[three] \n\t" - "or %[g], %[src0], %[g] \n\t" - "psrlh %[r], %[src1], %[three] \n\t" - "psllh %[src0], %[b], %[three] \n\t" - "psrlh %[src1], %[b], %[two] \n\t" - "or %[b], %[src0], %[src1] \n\t" - "psllh %[src0], %[g], %[two] \n\t" - "psrlh %[src1], %[g], %[four] \n\t" - "or %[g], %[src0], %[src1] \n\t" - "psllh %[src0], %[r], %[three] \n\t" - "psrlh %[src1], %[r], %[two] \n\t" - "or %[r], %[src0], %[src1] \n\t" - "punpcklhw %[src0], %[b], %[r] \n\t" - "punpcklhw %[src1], %[g], %[value] \n\t" - "punpcklhw %[src_lo], %[src0], %[src1] \n\t" - "punpckhhw %[src_hi], %[src0], %[src1] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t" - "punpcklwd %[src0], %[src_lo], %[src_hi] \n\t" - "punpckhwd %[src1], %[src_lo], %[src_hi] \n\t" - "paddw %[dest2], %[src0], %[src1] \n\t" - "psrlw %[dest2], %[dest2], %[eight] \n\t" - - "punpckhhw %[src0], %[b], %[r] \n\t" - "punpckhhw %[src1], %[g], %[value] \n\t" - "punpcklhw %[src_lo], %[src0], %[src1] \n\t" - "punpckhhw %[src_hi], %[src0], %[src1] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t" - "punpcklwd %[src0], %[src_lo], %[src_hi] \n\t" - "punpckhwd %[src1], %[src_lo], %[src_hi] \n\t" - "paddw %[dest3], %[src0], %[src1] \n\t" - "psrlw %[dest3], %[dest3], %[eight] \n\t" - - "packsswh %[src_lo], %[dest0], %[dest1] \n\t" - "packsswh %[src_hi], %[dest2], %[dest3] \n\t" - "packushb %[dest0], %[src_lo], %[src_hi] \n\t" - "gssdlc1 %[dest0], 0x07(%[dst_y]) \n\t" - "gssdrc1 
%[dest0], 0x00(%[dst_y]) \n\t" - - "daddiu %[src_rgb565], %[src_rgb565], 0x10 \n\t" - "daddiu %[dst_y], %[dst_y], 0x08 \n\t" - "daddiu %[width], %[width], -0x08 \n\t" - "bgtz %[width], 1b \n\t" - : [src0] "=&f"(ftmp[0]), [src1] "=&f"(ftmp[1]), [src_lo] "=&f"(ftmp[2]), - [src_hi] "=&f"(ftmp[3]), [b] "=&f"(ftmp[4]), [g] "=&f"(ftmp[5]), - [r] "=&f"(ftmp[6]), [dest0] "=&f"(ftmp[7]), [dest1] "=&f"(ftmp[8]), - [dest2] "=&f"(ftmp[9]), [dest3] "=&f"(ftmp[10]) - : [src_rgb565] "r"(src_rgb565), [dst_y] "r"(dst_y), [value] "f"(value), - [width] "r"(width), [c0] "f"(c0), [c1] "f"(c1), [c2] "f"(c2), - [mask] "f"(mask), [eight] "f"(0x08), [five] "f"(0x05), - [three] "f"(0x03), [two] "f"(0x02), [four] "f"(0x04) - : "memory"); -} - -void ARGB1555ToYRow_MMI(const uint8_t* src_argb1555, - uint8_t* dst_y, - int width) { - uint64_t ftmp[11]; - const uint64_t value = 0x1080108010801080; - const uint64_t mask = 0x0001004200810019; - uint64_t c0 = 0x001f001f001f001f; - uint64_t c1 = 0x00ff00ff00ff00ff; - uint64_t c2 = 0x0003000300030003; - uint64_t c3 = 0x007c007c007c007c; - __asm__ volatile( - "1: \n\t" - "gsldrc1 %[src0], 0x00(%[src_argb1555]) \n\t" - "gsldlc1 %[src0], 0x07(%[src_argb1555]) \n\t" - "psrlh %[src1], %[src0], %[eight] \n\t" - "and %[b], %[src0], %[c0] \n\t" - "and %[src0], %[src0], %[c1] \n\t" - "psrlh %[src0], %[src0], %[five] \n\t" - "and %[g], %[src1], %[c2] \n\t" - "psllh %[g], %[g], %[three] \n\t" - "or %[g], %[src0], %[g] \n\t" - "and %[r], %[src1], %[c3] \n\t" - "psrlh %[r], %[r], %[two] \n\t" - "psllh %[src0], %[b], %[three] \n\t" - "psrlh %[src1], %[b], %[two] \n\t" - "or %[b], %[src0], %[src1] \n\t" - "psllh %[src0], %[g], %[three] \n\t" - "psrlh %[src1], %[g], %[two] \n\t" - "or %[g], %[src0], %[src1] \n\t" - "psllh %[src0], %[r], %[three] \n\t" - "psrlh %[src1], %[r], %[two] \n\t" - "or %[r], %[src0], %[src1] \n\t" - "punpcklhw %[src0], %[b], %[r] \n\t" - "punpcklhw %[src1], %[g], %[value] \n\t" - "punpcklhw %[src_lo], %[src0], %[src1] \n\t" - "punpckhhw %[src_hi], %[src0], %[src1] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t" - "punpcklwd %[src0], %[src_lo], %[src_hi] \n\t" - "punpckhwd %[src1], %[src_lo], %[src_hi] \n\t" - "paddw %[dest0], %[src0], %[src1] \n\t" - "psrlw %[dest0], %[dest0], %[eight] \n\t" - - "punpckhhw %[src0], %[b], %[r] \n\t" - "punpckhhw %[src1], %[g], %[value] \n\t" - "punpcklhw %[src_lo], %[src0], %[src1] \n\t" - "punpckhhw %[src_hi], %[src0], %[src1] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t" - "punpcklwd %[src0], %[src_lo], %[src_hi] \n\t" - "punpckhwd %[src1], %[src_lo], %[src_hi] \n\t" - "paddw %[dest1], %[src0], %[src1] \n\t" - "psrlw %[dest1], %[dest1], %[eight] \n\t" - - "gsldrc1 %[src0], 0x08(%[src_argb1555]) \n\t" - "gsldlc1 %[src0], 0x0f(%[src_argb1555]) \n\t" - "psrlh %[src1], %[src0], %[eight] \n\t" - "and %[b], %[src0], %[c0] \n\t" - "and %[src0], %[src0], %[c1] \n\t" - "psrlh %[src0], %[src0], %[five] \n\t" - "and %[g], %[src1], %[c2] \n\t" - "psllh %[g], %[g], %[three] \n\t" - "or %[g], %[src0], %[g] \n\t" - "and %[r], %[src1], %[c3] \n\t" - "psrlh %[r], %[r], %[two] \n\t" - "psllh %[src0], %[b], %[three] \n\t" - "psrlh %[src1], %[b], %[two] \n\t" - "or %[b], %[src0], %[src1] \n\t" - "psllh %[src0], %[g], %[three] \n\t" - "psrlh %[src1], %[g], %[two] \n\t" - "or %[g], %[src0], %[src1] \n\t" - "psllh %[src0], %[r], %[three] \n\t" - "psrlh %[src1], %[r], %[two] \n\t" - "or %[r], %[src0], %[src1] \n\t" - "punpcklhw %[src0], %[b], %[r] 
\n\t" - "punpcklhw %[src1], %[g], %[value] \n\t" - "punpcklhw %[src_lo], %[src0], %[src1] \n\t" - "punpckhhw %[src_hi], %[src0], %[src1] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t" - "punpcklwd %[src0], %[src_lo], %[src_hi] \n\t" - "punpckhwd %[src1], %[src_lo], %[src_hi] \n\t" - "paddw %[dest2], %[src0], %[src1] \n\t" - "psrlw %[dest2], %[dest2], %[eight] \n\t" - - "punpckhhw %[src0], %[b], %[r] \n\t" - "punpckhhw %[src1], %[g], %[value] \n\t" - "punpcklhw %[src_lo], %[src0], %[src1] \n\t" - "punpckhhw %[src_hi], %[src0], %[src1] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t" - "punpcklwd %[src0], %[src_lo], %[src_hi] \n\t" - "punpckhwd %[src1], %[src_lo], %[src_hi] \n\t" - "paddw %[dest3], %[src0], %[src1] \n\t" - "psrlw %[dest3], %[dest3], %[eight] \n\t" - - "packsswh %[src_lo], %[dest0], %[dest1] \n\t" - "packsswh %[src_hi], %[dest2], %[dest3] \n\t" - "packushb %[dest0], %[src_lo], %[src_hi] \n\t" - "gssdlc1 %[dest0], 0x07(%[dst_y]) \n\t" - "gssdrc1 %[dest0], 0x00(%[dst_y]) \n\t" - - "daddiu %[src_argb1555], %[src_argb1555], 0x10 \n\t" - "daddiu %[dst_y], %[dst_y], 0x08 \n\t" - "daddiu %[width], %[width], -0x08 \n\t" - "bgtz %[width], 1b \n\t" - : [src0] "=&f"(ftmp[0]), [src1] "=&f"(ftmp[1]), [src_lo] "=&f"(ftmp[2]), - [src_hi] "=&f"(ftmp[3]), [b] "=&f"(ftmp[4]), [g] "=&f"(ftmp[5]), - [r] "=&f"(ftmp[6]), [dest0] "=&f"(ftmp[7]), [dest1] "=&f"(ftmp[8]), - [dest2] "=&f"(ftmp[9]), [dest3] "=&f"(ftmp[10]) - : [src_argb1555] "r"(src_argb1555), [dst_y] "r"(dst_y), - [width] "r"(width), [value] "f"(value), [mask] "f"(mask), [c0] "f"(c0), - [c1] "f"(c1), [c2] "f"(c2), [c3] "f"(c3), [eight] "f"(0x08), - [five] "f"(0x05), [three] "f"(0x03), [two] "f"(0x02), [seven] "f"(0x07) - : "memory"); -} - -void ARGB4444ToYRow_MMI(const uint8_t* src_argb4444, - uint8_t* dst_y, - int width) { - uint64_t ftmp[11]; - uint64_t value = 0x1080108010801080; - uint64_t mask = 0x0001004200810019; - uint64_t c0 = 0x000f000f000f000f; - uint64_t c1 = 0x00ff00ff00ff00ff; - __asm__ volatile( - "1: \n\t" - "gsldrc1 %[src0], 0x00(%[src_argb4444]) \n\t" - "gsldlc1 %[src0], 0x07(%[src_argb4444]) \n\t" - "psrlh %[src1], %[src0], %[eight] \n\t" - "and %[b], %[src0], %[c0] \n\t" - "and %[src0], %[src0], %[c1] \n\t" - "psrlh %[g], %[src0], %[four] \n\t" - "and %[r], %[src1], %[c0] \n\t" - "psllh %[src0], %[b], %[four] \n\t" - "or %[b], %[src0], %[b] \n\t" - "psllh %[src0], %[g], %[four] \n\t" - "or %[g], %[src0], %[g] \n\t" - "psllh %[src0], %[r], %[four] \n\t" - "or %[r], %[src0], %[r] \n\t" - "punpcklhw %[src0], %[b], %[r] \n\t" - "punpcklhw %[src1], %[g], %[value] \n\t" - "punpcklhw %[src_lo], %[src0], %[src1] \n\t" - "punpckhhw %[src_hi], %[src0], %[src1] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t" - "punpcklwd %[src0], %[src_lo], %[src_hi] \n\t" - "punpckhwd %[src1], %[src_lo], %[src_hi] \n\t" - "paddw %[dest0], %[src0], %[src1] \n\t" - "psrlw %[dest0], %[dest0], %[eight] \n\t" - - "punpckhhw %[src0], %[b], %[r] \n\t" - "punpckhhw %[src1], %[g], %[value] \n\t" - "punpcklhw %[src_lo], %[src0], %[src1] \n\t" - "punpckhhw %[src_hi], %[src0], %[src1] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t" - "punpcklwd %[src0], %[src_lo], %[src_hi] \n\t" - "punpckhwd %[src1], %[src_lo], %[src_hi] \n\t" - "paddw %[dest1], %[src0], %[src1] \n\t" - "psrlw %[dest1], %[dest1], %[eight] \n\t" - - "gsldrc1 %[src0], 
0x08(%[src_argb4444]) \n\t" - "gsldlc1 %[src0], 0x0f(%[src_argb4444]) \n\t" - "psrlh %[src1], %[src0], %[eight] \n\t" - "and %[b], %[src0], %[c0] \n\t" - "and %[src0], %[src0], %[c1] \n\t" - "psrlh %[g], %[src0], %[four] \n\t" - "and %[r], %[src1], %[c0] \n\t" - "psllh %[src0], %[b], %[four] \n\t" - "or %[b], %[src0], %[b] \n\t" - "psllh %[src0], %[g], %[four] \n\t" - "or %[g], %[src0], %[g] \n\t" - "psllh %[src0], %[r], %[four] \n\t" - "or %[r], %[src0], %[r] \n\t" - "punpcklhw %[src0], %[b], %[r] \n\t" - "punpcklhw %[src1], %[g], %[value] \n\t" - "punpcklhw %[src_lo], %[src0], %[src1] \n\t" - "punpckhhw %[src_hi], %[src0], %[src1] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t" - "punpcklwd %[src0], %[src_lo], %[src_hi] \n\t" - "punpckhwd %[src1], %[src_lo], %[src_hi] \n\t" - "paddw %[dest2], %[src0], %[src1] \n\t" - "psrlw %[dest2], %[dest2], %[eight] \n\t" - - "punpckhhw %[src0], %[b], %[r] \n\t" - "punpckhhw %[src1], %[g], %[value] \n\t" - "punpcklhw %[src_lo], %[src0], %[src1] \n\t" - "punpckhhw %[src_hi], %[src0], %[src1] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t" - "punpcklwd %[src0], %[src_lo], %[src_hi] \n\t" - "punpckhwd %[src1], %[src_lo], %[src_hi] \n\t" - "paddw %[dest3], %[src0], %[src1] \n\t" - "psrlw %[dest3], %[dest3], %[eight] \n\t" - - "packsswh %[src_lo], %[dest0], %[dest1] \n\t" - "packsswh %[src_hi], %[dest2], %[dest3] \n\t" - "packushb %[dest0], %[src_lo], %[src_hi] \n\t" - "gssdlc1 %[dest0], 0x07(%[dst_y]) \n\t" - "gssdrc1 %[dest0], 0x00(%[dst_y]) \n\t" - - "daddiu %[src_argb4444], %[src_argb4444], 0x10 \n\t" - "daddiu %[dst_y], %[dst_y], 0x08 \n\t" - "daddiu %[width], %[width], -0x08 \n\t" - "bgtz %[width], 1b \n\t" - : [src0] "=&f"(ftmp[0]), [src1] "=&f"(ftmp[1]), [src_lo] "=&f"(ftmp[2]), - [src_hi] "=&f"(ftmp[3]), [b] "=&f"(ftmp[4]), [g] "=&f"(ftmp[5]), - [r] "=&f"(ftmp[6]), [dest0] "=&f"(ftmp[7]), [dest1] "=&f"(ftmp[8]), - [dest2] "=&f"(ftmp[9]), [dest3] "=&f"(ftmp[10]) - : [src_argb4444] "r"(src_argb4444), [dst_y] "r"(dst_y), - [width] "r"(width), [value] "f"(value), [mask] "f"(mask), [c0] "f"(c0), - [c1] "f"(c1), [eight] "f"(0x08), [four] "f"(0x04) - : "memory"); -} - -void RGB565ToUVRow_MMI(const uint8_t* src_rgb565, - int src_stride_rgb565, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - uint64_t ftmp[13]; - uint64_t value = 0x2020202020202020; - uint64_t mask_u = 0x0026004a00700002; - uint64_t mask_v = 0x00020070005e0012; - uint64_t mask = 0x93; - uint64_t c0 = 0x001f001f001f001f; - uint64_t c1 = 0x00ff00ff00ff00ff; - uint64_t c2 = 0x0007000700070007; - __asm__ volatile( - "daddu %[next_rgb565], %[src_rgb565], %[next_rgb565] \n\t" - "1: \n\t" - "gsldrc1 %[src0], 0x00(%[src_rgb565]) \n\t" - "gsldlc1 %[src0], 0x07(%[src_rgb565]) \n\t" - "gsldrc1 %[src1], 0x00(%[next_rgb565]) \n\t" - "gsldlc1 %[src1], 0x07(%[next_rgb565]) \n\t" - "psrlh %[dest0_u], %[src0], %[eight] \n\t" - "and %[b0], %[src0], %[c0] \n\t" - "and %[src0], %[src0], %[c1] \n\t" - "psrlh %[src0], %[src0], %[five] \n\t" - "and %[g0], %[dest0_u], %[c2] \n\t" - "psllh %[g0], %[g0], %[three] \n\t" - "or %[g0], %[src0], %[g0] \n\t" - "psrlh %[r0], %[dest0_u], %[three] \n\t" - "psrlh %[src0], %[src1], %[eight] \n\t" - "and %[dest0_u], %[src1], %[c0] \n\t" - "and %[src1], %[src1], %[c1] \n\t" - "psrlh %[src1], %[src1], %[five] \n\t" - "and %[dest0_v], %[src0], %[c2] \n\t" - "psllh %[dest0_v], %[dest0_v], %[three] \n\t" - "or %[dest0_v], %[src1], %[dest0_v] \n\t" - "psrlh %[src0], %[src0], 
%[three] \n\t" - "paddh %[b0], %[b0], %[dest0_u] \n\t" - "paddh %[g0], %[g0], %[dest0_v] \n\t" - "paddh %[r0], %[r0], %[src0] \n\t" - "punpcklhw %[src0], %[b0], %[r0] \n\t" - "punpckhhw %[src1], %[b0], %[r0] \n\t" - "punpcklwd %[dest0_u], %[src0], %[src1] \n\t" - "punpckhwd %[dest0_v], %[src0], %[src1] \n\t" - "paddh %[src0], %[dest0_u], %[dest0_v] \n\t" - "psrlh %[b0], %[src0], %[six] \n\t" - "psllh %[r0], %[src0], %[one] \n\t" - "or %[b0], %[b0], %[r0] \n\t" - "punpcklhw %[src0], %[g0], %[value] \n\t" - "punpckhhw %[src1], %[g0], %[value] \n\t" - "punpcklwd %[dest0_u], %[src0], %[src1] \n\t" - "punpckhwd %[dest0_v], %[src0], %[src1] \n\t" - "paddh %[g0], %[dest0_u], %[dest0_v] \n\t" - "punpcklhw %[src0], %[b0], %[g0] \n\t" - "punpckhhw %[src1], %[b0], %[g0] \n\t" - - "pmaddhw %[dest0_v], %[src0], %[mask_v] \n\t" - "pshufh %[dest0_u], %[src0], %[mask] \n\t" - "pmaddhw %[dest0_u], %[dest0_u], %[mask_u] \n\t" - "pmaddhw %[g0], %[src1], %[mask_v] \n\t" - "pshufh %[b0], %[src1], %[mask] \n\t" - "pmaddhw %[b0], %[b0], %[mask_u] \n\t" - - "punpcklwd %[src0], %[dest0_u], %[b0] \n\t" - "punpckhwd %[src1], %[dest0_u], %[b0] \n\t" - "psubw %[dest0_u], %[src0], %[src1] \n\t" - "psraw %[dest0_u], %[dest0_u], %[eight] \n\t" - "punpcklwd %[src0], %[dest0_v], %[g0] \n\t" - "punpckhwd %[src1], %[dest0_v], %[g0] \n\t" - "psubw %[dest0_v], %[src1], %[src0] \n\t" - "psraw %[dest0_v], %[dest0_v], %[eight] \n\t" - - "gsldrc1 %[src0], 0x08(%[src_rgb565]) \n\t" - "gsldlc1 %[src0], 0x0f(%[src_rgb565]) \n\t" - "gsldrc1 %[src1], 0x08(%[next_rgb565]) \n\t" - "gsldlc1 %[src1], 0x0f(%[next_rgb565]) \n\t" - "psrlh %[dest1_u], %[src0], %[eight] \n\t" - "and %[b0], %[src0], %[c0] \n\t" - "and %[src0], %[src0], %[c1] \n\t" - "psrlh %[src0], %[src0], %[five] \n\t" - "and %[g0], %[dest1_u], %[c2] \n\t" - "psllh %[g0], %[g0], %[three] \n\t" - "or %[g0], %[src0], %[g0] \n\t" - "psrlh %[r0], %[dest1_u], %[three] \n\t" - "psrlh %[src0], %[src1], %[eight] \n\t" - "and %[dest1_u], %[src1], %[c0] \n\t" - "and %[src1], %[src1], %[c1] \n\t" - "psrlh %[src1], %[src1], %[five] \n\t" - "and %[dest1_v], %[src0], %[c2] \n\t" - "psllh %[dest1_v], %[dest1_v], %[three] \n\t" - "or %[dest1_v], %[src1], %[dest1_v] \n\t" - "psrlh %[src0], %[src0], %[three] \n\t" - "paddh %[b0], %[b0], %[dest1_u] \n\t" - "paddh %[g0], %[g0], %[dest1_v] \n\t" - "paddh %[r0], %[r0], %[src0] \n\t" - "punpcklhw %[src0], %[b0], %[r0] \n\t" - "punpckhhw %[src1], %[b0], %[r0] \n\t" - "punpcklwd %[dest1_u], %[src0], %[src1] \n\t" - "punpckhwd %[dest1_v], %[src0], %[src1] \n\t" - "paddh %[src0], %[dest1_u], %[dest1_v] \n\t" - "psrlh %[b0], %[src0], %[six] \n\t" - "psllh %[r0], %[src0], %[one] \n\t" - "or %[b0], %[b0], %[r0] \n\t" - "punpcklhw %[src0], %[g0], %[value] \n\t" - "punpckhhw %[src1], %[g0], %[value] \n\t" - "punpcklwd %[dest1_u], %[src0], %[src1] \n\t" - "punpckhwd %[dest1_v], %[src0], %[src1] \n\t" - "paddh %[g0], %[dest1_u], %[dest1_v] \n\t" - "punpcklhw %[src0], %[b0], %[g0] \n\t" - "punpckhhw %[src1], %[b0], %[g0] \n\t" - - "pmaddhw %[dest1_v], %[src0], %[mask_v] \n\t" - "pshufh %[dest1_u], %[src0], %[mask] \n\t" - "pmaddhw %[dest1_u], %[dest1_u], %[mask_u] \n\t" - "pmaddhw %[g0], %[src1], %[mask_v] \n\t" - "pshufh %[b0], %[src1], %[mask] \n\t" - "pmaddhw %[b0], %[b0], %[mask_u] \n\t" - - "punpcklwd %[src0], %[dest1_u], %[b0] \n\t" - "punpckhwd %[src1], %[dest1_u], %[b0] \n\t" - "psubw %[dest1_u], %[src0], %[src1] \n\t" - "psraw %[dest1_u], %[dest1_u], %[eight] \n\t" - "punpcklwd %[src0], %[dest1_v], %[g0] \n\t" - "punpckhwd %[src1], %[dest1_v], %[g0] 
\n\t" - "psubw %[dest1_v], %[src1], %[src0] \n\t" - "psraw %[dest1_v], %[dest1_v], %[eight] \n\t" - - "gsldrc1 %[src0], 0x10(%[src_rgb565]) \n\t" - "gsldlc1 %[src0], 0x17(%[src_rgb565]) \n\t" - "gsldrc1 %[src1], 0x10(%[next_rgb565]) \n\t" - "gsldlc1 %[src1], 0x17(%[next_rgb565]) \n\t" - "psrlh %[dest2_u], %[src0], %[eight] \n\t" - "and %[b0], %[src0], %[c0] \n\t" - "and %[src0], %[src0], %[c1] \n\t" - "psrlh %[src0], %[src0], %[five] \n\t" - "and %[g0], %[dest2_u], %[c2] \n\t" - "psllh %[g0], %[g0], %[three] \n\t" - "or %[g0], %[src0], %[g0] \n\t" - "psrlh %[r0], %[dest2_u], %[three] \n\t" - "psrlh %[src0], %[src1], %[eight] \n\t" - "and %[dest2_u], %[src1], %[c0] \n\t" - "and %[src1], %[src1], %[c1] \n\t" - "psrlh %[src1], %[src1], %[five] \n\t" - "and %[dest2_v], %[src0], %[c2] \n\t" - "psllh %[dest2_v], %[dest2_v], %[three] \n\t" - "or %[dest2_v], %[src1], %[dest2_v] \n\t" - "psrlh %[src0], %[src0], %[three] \n\t" - "paddh %[b0], %[b0], %[dest2_u] \n\t" - "paddh %[g0], %[g0], %[dest2_v] \n\t" - "paddh %[r0], %[r0], %[src0] \n\t" - "punpcklhw %[src0], %[b0], %[r0] \n\t" - "punpckhhw %[src1], %[b0], %[r0] \n\t" - "punpcklwd %[dest2_u], %[src0], %[src1] \n\t" - "punpckhwd %[dest2_v], %[src0], %[src1] \n\t" - "paddh %[src0], %[dest2_u], %[dest2_v] \n\t" - "psrlh %[b0], %[src0], %[six] \n\t" - "psllh %[r0], %[src0], %[one] \n\t" - "or %[b0], %[b0], %[r0] \n\t" - "punpcklhw %[src0], %[g0], %[value] \n\t" - "punpckhhw %[src1], %[g0], %[value] \n\t" - "punpcklwd %[dest2_u], %[src0], %[src1] \n\t" - "punpckhwd %[dest2_v], %[src0], %[src1] \n\t" - "paddh %[g0], %[dest2_u], %[dest2_v] \n\t" - "punpcklhw %[src0], %[b0], %[g0] \n\t" - "punpckhhw %[src1], %[b0], %[g0] \n\t" - - "pmaddhw %[dest2_v], %[src0], %[mask_v] \n\t" - "pshufh %[dest2_u], %[src0], %[mask] \n\t" - "pmaddhw %[dest2_u], %[dest2_u], %[mask_u] \n\t" - "pmaddhw %[g0], %[src1], %[mask_v] \n\t" - "pshufh %[b0], %[src1], %[mask] \n\t" - "pmaddhw %[b0], %[b0], %[mask_u] \n\t" - - "punpcklwd %[src0], %[dest2_u], %[b0] \n\t" - "punpckhwd %[src1], %[dest2_u], %[b0] \n\t" - "psubw %[dest2_u], %[src0], %[src1] \n\t" - "psraw %[dest2_u], %[dest2_u], %[eight] \n\t" - "punpcklwd %[src0], %[dest2_v], %[g0] \n\t" - "punpckhwd %[src1], %[dest2_v], %[g0] \n\t" - "psubw %[dest2_v], %[src1], %[src0] \n\t" - "psraw %[dest2_v], %[dest2_v], %[eight] \n\t" - - "gsldrc1 %[src0], 0x18(%[src_rgb565]) \n\t" - "gsldlc1 %[src0], 0x1f(%[src_rgb565]) \n\t" - "gsldrc1 %[src1], 0x18(%[next_rgb565]) \n\t" - "gsldlc1 %[src1], 0x1f(%[next_rgb565]) \n\t" - "psrlh %[dest3_u], %[src0], %[eight] \n\t" - "and %[b0], %[src0], %[c0] \n\t" - "and %[src0], %[src0], %[c1] \n\t" - "psrlh %[src0], %[src0], %[five] \n\t" - "and %[g0], %[dest3_u], %[c2] \n\t" - "psllh %[g0], %[g0], %[three] \n\t" - "or %[g0], %[src0], %[g0] \n\t" - "psrlh %[r0], %[dest3_u], %[three] \n\t" - "psrlh %[src0], %[src1], %[eight] \n\t" - "and %[dest3_u], %[src1], %[c0] \n\t" - "and %[src1], %[src1], %[c1] \n\t" - "psrlh %[src1], %[src1], %[five] \n\t" - "and %[dest3_v], %[src0], %[c2] \n\t" - "psllh %[dest3_v], %[dest3_v], %[three] \n\t" - "or %[dest3_v], %[src1], %[dest3_v] \n\t" - "psrlh %[src0], %[src0], %[three] \n\t" - "paddh %[b0], %[b0], %[dest3_u] \n\t" - "paddh %[g0], %[g0], %[dest3_v] \n\t" - "paddh %[r0], %[r0], %[src0] \n\t" - "punpcklhw %[src0], %[b0], %[r0] \n\t" - "punpckhhw %[src1], %[b0], %[r0] \n\t" - "punpcklwd %[dest3_u], %[src0], %[src1] \n\t" - "punpckhwd %[dest3_v], %[src0], %[src1] \n\t" - "paddh %[src0], %[dest3_u], %[dest3_v] \n\t" - "psrlh %[b0], %[src0], %[six] \n\t" - "psllh 
%[r0], %[src0], %[one] \n\t" - "or %[b0], %[b0], %[r0] \n\t" - "punpcklhw %[src0], %[g0], %[value] \n\t" - "punpckhhw %[src1], %[g0], %[value] \n\t" - "punpcklwd %[dest3_u], %[src0], %[src1] \n\t" - "punpckhwd %[dest3_v], %[src0], %[src1] \n\t" - "paddh %[g0], %[dest3_u], %[dest3_v] \n\t" - "punpcklhw %[src0], %[b0], %[g0] \n\t" - "punpckhhw %[src1], %[b0], %[g0] \n\t" - - "pmaddhw %[dest3_v], %[src0], %[mask_v] \n\t" - "pshufh %[dest3_u], %[src0], %[mask] \n\t" - "pmaddhw %[dest3_u], %[dest3_u], %[mask_u] \n\t" - "pmaddhw %[g0], %[src1], %[mask_v] \n\t" - "pshufh %[b0], %[src1], %[mask] \n\t" - "pmaddhw %[b0], %[b0], %[mask_u] \n\t" - - "punpcklwd %[src0], %[dest3_u], %[b0] \n\t" - "punpckhwd %[src1], %[dest3_u], %[b0] \n\t" - "psubw %[dest3_u], %[src0], %[src1] \n\t" - "psraw %[dest3_u], %[dest3_u], %[eight] \n\t" - "punpcklwd %[src0], %[dest3_v], %[g0] \n\t" - "punpckhwd %[src1], %[dest3_v], %[g0] \n\t" - "psubw %[dest3_v], %[src1], %[src0] \n\t" - "psraw %[dest3_v], %[dest3_v], %[eight] \n\t" - - "packsswh %[src0], %[dest0_u], %[dest1_u] \n\t" - "packsswh %[src1], %[dest2_u], %[dest3_u] \n\t" - "packushb %[dest0_u], %[src0], %[src1] \n\t" - "gssdlc1 %[dest0_u], 0x07(%[dst_u]) \n\t" - "gssdrc1 %[dest0_u], 0x00(%[dst_u]) \n\t" - "packsswh %[src0], %[dest0_v], %[dest1_v] \n\t" - "packsswh %[src1], %[dest2_v], %[dest3_v] \n\t" - "packushb %[dest0_v], %[src0], %[src1] \n\t" - "gssdlc1 %[dest0_v], 0x07(%[dst_v]) \n\t" - "gssdrc1 %[dest0_v], 0x00(%[dst_v]) \n\t" - - "daddiu %[src_rgb565], %[src_rgb565], 0x20 \n\t" - "daddiu %[next_rgb565], %[next_rgb565], 0x20 \n\t" - "daddiu %[dst_u], %[dst_u], 0x08 \n\t" - "daddiu %[dst_v], %[dst_v], 0x08 \n\t" - "daddiu %[width], %[width], -0x10 \n\t" - "bgtz %[width], 1b \n\t" - : [src0] "=&f"(ftmp[0]), [src1] "=&f"(ftmp[1]), [b0] "=&f"(ftmp[2]), - [g0] "=&f"(ftmp[3]), [r0] "=&f"(ftmp[4]), [dest0_u] "=&f"(ftmp[5]), - [dest1_u] "=&f"(ftmp[6]), [dest2_u] "=&f"(ftmp[7]), - [dest3_u] "=&f"(ftmp[8]), [dest0_v] "=&f"(ftmp[9]), - [dest1_v] "=&f"(ftmp[10]), [dest2_v] "=&f"(ftmp[11]), - [dest3_v] "=&f"(ftmp[12]) - : [src_rgb565] "r"(src_rgb565), [next_rgb565] "r"(src_stride_rgb565), - [dst_u] "r"(dst_u), [dst_v] "r"(dst_v), [width] "r"(width), - [value] "f"(value), [c0] "f"(c0), [c1] "f"(c1), [c2] "f"(c2), - [mask] "f"(mask), [mask_u] "f"(mask_u), [mask_v] "f"(mask_v), - [eight] "f"(0x08), [six] "f"(0x06), [five] "f"(0x05), [three] "f"(0x03), - [one] "f"(0x01) - : "memory"); -} - -void ARGB1555ToUVRow_MMI(const uint8_t* src_argb1555, - int src_stride_argb1555, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - uint64_t ftmp[11]; - uint64_t value = 0x2020202020202020; - uint64_t mask_u = 0x0026004a00700002; - uint64_t mask_v = 0x00020070005e0012; - uint64_t mask = 0x93; - uint64_t c0 = 0x001f001f001f001f; - uint64_t c1 = 0x00ff00ff00ff00ff; - uint64_t c2 = 0x0003000300030003; - uint64_t c3 = 0x007c007c007c007c; - __asm__ volatile( - "daddu %[next_argb1555], %[src_argb1555], %[next_argb1555] \n\t" - "1: \n\t" - "gsldrc1 %[src0], 0x00(%[src_argb1555]) \n\t" - "gsldlc1 %[src0], 0x07(%[src_argb1555]) \n\t" - "gsldrc1 %[src1], 0x00(%[next_argb1555]) \n\t" - "gsldlc1 %[src1], 0x07(%[next_argb1555]) \n\t" - "psrlh %[dest0_u], %[src0], %[eight] \n\t" - "and %[b0], %[src0], %[c0] \n\t" - "and %[src0], %[src0], %[c1] \n\t" - "psrlh %[src0], %[src0], %[five] \n\t" - "and %[g0], %[dest0_u], %[c2] \n\t" - "psllh %[g0], %[g0], %[three] \n\t" - "or %[g0], %[src0], %[g0] \n\t" - "and %[r0], %[dest0_u], %[c3] \n\t" - "psrlh %[r0], %[r0], %[two] \n\t" - "psrlh %[src0], %[src1], 
%[eight] \n\t" - "and %[dest0_u], %[src1], %[c0] \n\t" - "and %[src1], %[src1], %[c1] \n\t" - "psrlh %[src1], %[src1], %[five] \n\t" - "and %[dest0_v], %[src0], %[c2] \n\t" - "psllh %[dest0_v], %[dest0_v], %[three] \n\t" - "or %[dest0_v], %[src1], %[dest0_v] \n\t" - "and %[src0], %[src0], %[c3] \n\t" - "psrlh %[src0], %[src0], %[two] \n\t" - "paddh %[b0], %[b0], %[dest0_u] \n\t" - "paddh %[g0], %[g0], %[dest0_v] \n\t" - "paddh %[r0], %[r0], %[src0] \n\t" - "punpcklhw %[src0], %[b0], %[r0] \n\t" - "punpckhhw %[src1], %[b0], %[r0] \n\t" - "punpcklwd %[dest0_u], %[src0], %[src1] \n\t" - "punpckhwd %[dest0_v], %[src0], %[src1] \n\t" - "paddh %[src0], %[dest0_u], %[dest0_v] \n\t" - "psrlh %[b0], %[src0], %[six] \n\t" - "psllh %[r0], %[src0], %[one] \n\t" - "or %[b0], %[b0], %[r0] \n\t" - "psrlh %[r0], %[g0], %[six] \n\t" - "psllh %[g0], %[g0], %[one] \n\t" - "or %[g0], %[g0], %[r0] \n\t" - "punpcklhw %[src0], %[g0], %[value] \n\t" - "punpckhhw %[src1], %[g0], %[value] \n\t" - "punpcklwd %[dest0_u], %[src0], %[src1] \n\t" - "punpckhwd %[dest0_v], %[src0], %[src1] \n\t" - "paddh %[g0], %[dest0_u], %[dest0_v] \n\t" - "punpcklhw %[src0], %[b0], %[g0] \n\t" - "punpckhhw %[src1], %[b0], %[g0] \n\t" - - "pmaddhw %[dest0_v], %[src0], %[mask_v] \n\t" - "pshufh %[dest0_u], %[src0], %[mask] \n\t" - "pmaddhw %[dest0_u], %[dest0_u], %[mask_u] \n\t" - "pmaddhw %[g0], %[src1], %[mask_v] \n\t" - "pshufh %[b0], %[src1], %[mask] \n\t" - "pmaddhw %[b0], %[b0], %[mask_u] \n\t" - - "punpcklwd %[src0], %[dest0_u], %[b0] \n\t" - "punpckhwd %[src1], %[dest0_u], %[b0] \n\t" - "psubw %[dest0_u], %[src0], %[src1] \n\t" - "psraw %[dest0_u], %[dest0_u], %[eight] \n\t" - "punpcklwd %[src0], %[dest0_v], %[g0] \n\t" - "punpckhwd %[src1], %[dest0_v], %[g0] \n\t" - "psubw %[dest0_v], %[src1], %[src0] \n\t" - "psraw %[dest0_v], %[dest0_v], %[eight] \n\t" - - "gsldrc1 %[src0], 0x08(%[src_argb1555]) \n\t" - "gsldlc1 %[src0], 0x0f(%[src_argb1555]) \n\t" - "gsldrc1 %[src1], 0x08(%[next_argb1555]) \n\t" - "gsldlc1 %[src1], 0x0f(%[next_argb1555]) \n\t" - "psrlh %[dest1_u], %[src0], %[eight] \n\t" - "and %[b0], %[src0], %[c0] \n\t" - "and %[src0], %[src0], %[c1] \n\t" - "psrlh %[src0], %[src0], %[five] \n\t" - "and %[g0], %[dest1_u], %[c2] \n\t" - "psllh %[g0], %[g0], %[three] \n\t" - "or %[g0], %[src0], %[g0] \n\t" - "and %[r0], %[dest1_u], %[c3] \n\t" - "psrlh %[r0], %[r0], %[two] \n\t" - "psrlh %[src0], %[src1], %[eight] \n\t" - "and %[dest1_u], %[src1], %[c0] \n\t" - "and %[src1], %[src1], %[c1] \n\t" - "psrlh %[src1], %[src1], %[five] \n\t" - "and %[dest1_v], %[src0], %[c2] \n\t" - "psllh %[dest1_v], %[dest1_v], %[three] \n\t" - "or %[dest1_v], %[src1], %[dest1_v] \n\t" - "and %[src0], %[src0], %[c3] \n\t" - "psrlh %[src0], %[src0], %[two] \n\t" - "paddh %[b0], %[b0], %[dest1_u] \n\t" - "paddh %[g0], %[g0], %[dest1_v] \n\t" - "paddh %[r0], %[r0], %[src0] \n\t" - "punpcklhw %[src0], %[b0], %[r0] \n\t" - "punpckhhw %[src1], %[b0], %[r0] \n\t" - "punpcklwd %[dest1_u], %[src0], %[src1] \n\t" - "punpckhwd %[dest1_v], %[src0], %[src1] \n\t" - "paddh %[src0], %[dest1_u], %[dest1_v] \n\t" - "psrlh %[b0], %[src0], %[six] \n\t" - "psllh %[r0], %[src0], %[one] \n\t" - "or %[b0], %[b0], %[r0] \n\t" - "psrlh %[r0], %[g0], %[six] \n\t" - "psllh %[g0], %[g0], %[one] \n\t" - "or %[g0], %[g0], %[r0] \n\t" - "punpcklhw %[src0], %[g0], %[value] \n\t" - "punpckhhw %[src1], %[g0], %[value] \n\t" - "punpcklwd %[dest1_u], %[src0], %[src1] \n\t" - "punpckhwd %[dest1_v], %[src0], %[src1] \n\t" - "paddh %[g0], %[dest1_u], %[dest1_v] \n\t" - "punpcklhw 
%[src0], %[b0], %[g0] \n\t" - "punpckhhw %[src1], %[b0], %[g0] \n\t" - - "pmaddhw %[dest1_v], %[src0], %[mask_v] \n\t" - "pshufh %[dest1_u], %[src0], %[mask] \n\t" - "pmaddhw %[dest1_u], %[dest1_u], %[mask_u] \n\t" - "pmaddhw %[g0], %[src1], %[mask_v] \n\t" - "pshufh %[b0], %[src1], %[mask] \n\t" - "pmaddhw %[b0], %[b0], %[mask_u] \n\t" - - "punpcklwd %[src0], %[dest1_u], %[b0] \n\t" - "punpckhwd %[src1], %[dest1_u], %[b0] \n\t" - "psubw %[dest1_u], %[src0], %[src1] \n\t" - "psraw %[dest1_u], %[dest1_u], %[eight] \n\t" - "punpcklwd %[src0], %[dest1_v], %[g0] \n\t" - "punpckhwd %[src1], %[dest1_v], %[g0] \n\t" - "psubw %[dest1_v], %[src1], %[src0] \n\t" - "psraw %[dest1_v], %[dest1_v], %[eight] \n\t" - - "packsswh %[dest0_u], %[dest0_u], %[dest1_u] \n\t" - "packsswh %[dest1_u], %[dest0_v], %[dest1_v] \n\t" - - "gsldrc1 %[src0], 0x10(%[src_argb1555]) \n\t" - "gsldlc1 %[src0], 0x17(%[src_argb1555]) \n\t" - "gsldrc1 %[src1], 0x10(%[next_argb1555]) \n\t" - "gsldlc1 %[src1], 0x17(%[next_argb1555]) \n\t" - "psrlh %[dest2_u], %[src0], %[eight] \n\t" - "and %[b0], %[src0], %[c0] \n\t" - "and %[src0], %[src0], %[c1] \n\t" - "psrlh %[src0], %[src0], %[five] \n\t" - "and %[g0], %[dest2_u], %[c2] \n\t" - "psllh %[g0], %[g0], %[three] \n\t" - "or %[g0], %[src0], %[g0] \n\t" - "and %[r0], %[dest2_u], %[c3] \n\t" - "psrlh %[r0], %[r0], %[two] \n\t" - "psrlh %[src0], %[src1], %[eight] \n\t" - "and %[dest2_u], %[src1], %[c0] \n\t" - "and %[src1], %[src1], %[c1] \n\t" - "psrlh %[src1], %[src1], %[five] \n\t" - "and %[dest0_v], %[src0], %[c2] \n\t" - "psllh %[dest0_v], %[dest0_v], %[three] \n\t" - "or %[dest0_v], %[src1], %[dest0_v] \n\t" - "and %[src0], %[src0], %[c3] \n\t" - "psrlh %[src0], %[src0], %[two] \n\t" - "paddh %[b0], %[b0], %[dest2_u] \n\t" - "paddh %[g0], %[g0], %[dest0_v] \n\t" - "paddh %[r0], %[r0], %[src0] \n\t" - "punpcklhw %[src0], %[b0], %[r0] \n\t" - "punpckhhw %[src1], %[b0], %[r0] \n\t" - "punpcklwd %[dest2_u], %[src0], %[src1] \n\t" - "punpckhwd %[dest0_v], %[src0], %[src1] \n\t" - "paddh %[src0], %[dest2_u], %[dest0_v] \n\t" - "psrlh %[b0], %[src0], %[six] \n\t" - "psllh %[r0], %[src0], %[one] \n\t" - "or %[b0], %[b0], %[r0] \n\t" - "psrlh %[r0], %[g0], %[six] \n\t" - "psllh %[g0], %[g0], %[one] \n\t" - "or %[g0], %[g0], %[r0] \n\t" - "punpcklhw %[src0], %[g0], %[value] \n\t" - "punpckhhw %[src1], %[g0], %[value] \n\t" - "punpcklwd %[dest2_u], %[src0], %[src1] \n\t" - "punpckhwd %[dest0_v], %[src0], %[src1] \n\t" - "paddh %[g0], %[dest2_u], %[dest0_v] \n\t" - "punpcklhw %[src0], %[b0], %[g0] \n\t" - "punpckhhw %[src1], %[b0], %[g0] \n\t" - - "pmaddhw %[dest0_v], %[src0], %[mask_v] \n\t" - "pshufh %[dest2_u], %[src0], %[mask] \n\t" - "pmaddhw %[dest2_u], %[dest2_u], %[mask_u] \n\t" - "pmaddhw %[g0], %[src1], %[mask_v] \n\t" - "pshufh %[b0], %[src1], %[mask] \n\t" - "pmaddhw %[b0], %[b0], %[mask_u] \n\t" - - "punpcklwd %[src0], %[dest2_u], %[b0] \n\t" - "punpckhwd %[src1], %[dest2_u], %[b0] \n\t" - "psubw %[dest2_u], %[src0], %[src1] \n\t" - "psraw %[dest2_u], %[dest2_u], %[eight] \n\t" - "punpcklwd %[src0], %[dest0_v], %[g0] \n\t" - "punpckhwd %[src1], %[dest0_v], %[g0] \n\t" - "psubw %[dest0_v], %[src1], %[src0] \n\t" - "psraw %[dest0_v], %[dest0_v], %[eight] \n\t" - - "gsldrc1 %[src0], 0x18(%[src_argb1555]) \n\t" - "gsldlc1 %[src0], 0x1f(%[src_argb1555]) \n\t" - "gsldrc1 %[src1], 0x18(%[next_argb1555]) \n\t" - "gsldlc1 %[src1], 0x1f(%[next_argb1555]) \n\t" - "psrlh %[dest3_u], %[src0], %[eight] \n\t" - "and %[b0], %[src0], %[c0] \n\t" - "and %[src0], %[src0], %[c1] \n\t" - "psrlh 
%[src0], %[src0], %[five] \n\t" - "and %[g0], %[dest3_u], %[c2] \n\t" - "psllh %[g0], %[g0], %[three] \n\t" - "or %[g0], %[src0], %[g0] \n\t" - "and %[r0], %[dest3_u], %[c3] \n\t" - "psrlh %[r0], %[r0], %[two] \n\t" - "psrlh %[src0], %[src1], %[eight] \n\t" - "and %[dest3_u], %[src1], %[c0] \n\t" - "and %[src1], %[src1], %[c1] \n\t" - "psrlh %[src1], %[src1], %[five] \n\t" - "and %[dest1_v], %[src0], %[c2] \n\t" - "psllh %[dest1_v], %[dest1_v], %[three] \n\t" - "or %[dest1_v], %[src1], %[dest1_v] \n\t" - "and %[src0], %[src0], %[c3] \n\t" - "psrlh %[src0], %[src0], %[two] \n\t" - "paddh %[b0], %[b0], %[dest3_u] \n\t" - "paddh %[g0], %[g0], %[dest1_v] \n\t" - "paddh %[r0], %[r0], %[src0] \n\t" - "punpcklhw %[src0], %[b0], %[r0] \n\t" - "punpckhhw %[src1], %[b0], %[r0] \n\t" - "punpcklwd %[dest3_u], %[src0], %[src1] \n\t" - "punpckhwd %[dest1_v], %[src0], %[src1] \n\t" - "paddh %[src0], %[dest3_u], %[dest1_v] \n\t" - "psrlh %[b0], %[src0], %[six] \n\t" - "psllh %[r0], %[src0], %[one] \n\t" - "or %[b0], %[b0], %[r0] \n\t" - "psrlh %[r0], %[g0], %[six] \n\t" - "psllh %[g0], %[g0], %[one] \n\t" - "or %[g0], %[g0], %[r0] \n\t" - "punpcklhw %[src0], %[g0], %[value] \n\t" - "punpckhhw %[src1], %[g0], %[value] \n\t" - "punpcklwd %[dest3_u], %[src0], %[src1] \n\t" - "punpckhwd %[dest1_v], %[src0], %[src1] \n\t" - "paddh %[g0], %[dest3_u], %[dest1_v] \n\t" - "punpcklhw %[src0], %[b0], %[g0] \n\t" - "punpckhhw %[src1], %[b0], %[g0] \n\t" - - "pmaddhw %[dest1_v], %[src0], %[mask_v] \n\t" - "pshufh %[dest3_u], %[src0], %[mask] \n\t" - "pmaddhw %[dest3_u], %[dest3_u], %[mask_u] \n\t" - "pmaddhw %[g0], %[src1], %[mask_v] \n\t" - "pshufh %[b0], %[src1], %[mask] \n\t" - "pmaddhw %[b0], %[b0], %[mask_u] \n\t" - - "punpcklwd %[src0], %[dest3_u], %[b0] \n\t" - "punpckhwd %[src1], %[dest3_u], %[b0] \n\t" - "psubw %[dest3_u], %[src0], %[src1] \n\t" - "psraw %[dest3_u], %[dest3_u], %[eight] \n\t" - "punpcklwd %[src0], %[dest1_v], %[g0] \n\t" - "punpckhwd %[src1], %[dest1_v], %[g0] \n\t" - "psubw %[dest1_v], %[src1], %[src0] \n\t" - "psraw %[dest1_v], %[dest1_v], %[eight] \n\t" - - "packsswh %[src1], %[dest2_u], %[dest3_u] \n\t" - "packushb %[dest0_u], %[dest0_u], %[src1] \n\t" - "gssdlc1 %[dest0_u], 0x07(%[dst_u]) \n\t" - "gssdrc1 %[dest0_u], 0x00(%[dst_u]) \n\t" - "packsswh %[src1], %[dest0_v], %[dest1_v] \n\t" - "packushb %[dest0_v], %[dest1_u], %[src1] \n\t" - "gssdlc1 %[dest0_v], 0x07(%[dst_v]) \n\t" - "gssdrc1 %[dest0_v], 0x00(%[dst_v]) \n\t" - - "daddiu %[src_argb1555], %[src_argb1555], 0x20 \n\t" - "daddiu %[next_argb1555], %[next_argb1555], 0x20 \n\t" - "daddiu %[dst_u], %[dst_u], 0x08 \n\t" - "daddiu %[dst_v], %[dst_v], 0x08 \n\t" - "daddiu %[width], %[width], -0x10 \n\t" - "bgtz %[width], 1b \n\t" - : [src0] "=&f"(ftmp[0]), [src1] "=&f"(ftmp[1]), [b0] "=&f"(ftmp[2]), - [g0] "=&f"(ftmp[3]), [r0] "=&f"(ftmp[4]), [dest0_u] "=&f"(ftmp[5]), - [dest1_u] "=&f"(ftmp[6]), [dest2_u] "=&f"(ftmp[7]), - [dest3_u] "=&f"(ftmp[8]), [dest0_v] "=&f"(ftmp[9]), - [dest1_v] "=&f"(ftmp[10]) - : [src_argb1555] "r"(src_argb1555), - [next_argb1555] "r"(src_stride_argb1555), [dst_u] "r"(dst_u), - [dst_v] "r"(dst_v), [width] "r"(width), [value] "f"(value), - [c0] "f"(c0), [c1] "f"(c1), [c2] "f"(c2), [c3] "f"(c3), - [mask] "f"(mask), [mask_u] "f"(mask_u), [mask_v] "f"(mask_v), - [eight] "f"(0x08), [six] "f"(0x06), [five] "f"(0x05), [three] "f"(0x03), - [two] "f"(0x02), [one] "f"(0x01) - : "memory"); -} - -void ARGB4444ToUVRow_MMI(const uint8_t* src_argb4444, - int src_stride_argb4444, - uint8_t* dst_u, - uint8_t* dst_v, - int 
width) { - uint64_t ftmp[13]; - uint64_t value = 0x2020202020202020; - uint64_t mask_u = 0x0026004a00700002; - uint64_t mask_v = 0x00020070005e0012; - uint64_t mask = 0x93; - uint64_t c0 = 0x000f000f000f000f; - uint64_t c1 = 0x00ff00ff00ff00ff; - __asm__ volatile( - "daddu %[next_argb4444], %[src_argb4444], %[next_argb4444] \n\t" - "1: \n\t" - "gsldrc1 %[src0], 0x00(%[src_argb4444]) \n\t" - "gsldlc1 %[src0], 0x07(%[src_argb4444]) \n\t" - "gsldrc1 %[src1], 0x00(%[next_argb4444]) \n\t" - "gsldlc1 %[src1], 0x07(%[next_argb4444]) \n\t" - "psrlh %[dest0_u], %[src0], %[eight] \n\t" - "and %[b0], %[src0], %[c0] \n\t" - "and %[src0], %[src0], %[c1] \n\t" - "psrlh %[g0], %[src0], %[four] \n\t" - "and %[r0], %[dest0_u], %[c0] \n\t" - "psrlh %[src0], %[src1], %[eight] \n\t" - "and %[dest0_u], %[src1], %[c0] \n\t" - "and %[src1], %[src1], %[c1] \n\t" - "psrlh %[dest0_v], %[src1], %[four] \n\t" - "and %[src0], %[src0], %[c0] \n\t" - "paddh %[b0], %[b0], %[dest0_u] \n\t" - "paddh %[g0], %[g0], %[dest0_v] \n\t" - "paddh %[r0], %[r0], %[src0] \n\t" - "punpcklhw %[src0], %[b0], %[r0] \n\t" - "punpckhhw %[src1], %[b0], %[r0] \n\t" - "punpcklwd %[dest0_u], %[src0], %[src1] \n\t" - "punpckhwd %[dest0_v], %[src0], %[src1] \n\t" - "paddh %[src0], %[dest0_u], %[dest0_v] \n\t" - "psrlh %[b0], %[src0], %[four] \n\t" - "psllh %[r0], %[src0], %[two] \n\t" - "or %[b0], %[b0], %[r0] \n\t" - "psrlh %[r0], %[g0], %[four] \n\t" - "psllh %[g0], %[g0], %[two] \n\t" - "or %[g0], %[g0], %[r0] \n\t" - "punpcklhw %[src0], %[g0], %[value] \n\t" - "punpckhhw %[src1], %[g0], %[value] \n\t" - "punpcklwd %[dest0_u], %[src0], %[src1] \n\t" - "punpckhwd %[dest0_v], %[src0], %[src1] \n\t" - "paddh %[g0], %[dest0_u], %[dest0_v] \n\t" - "punpcklhw %[src0], %[b0], %[g0] \n\t" - "punpckhhw %[src1], %[b0], %[g0] \n\t" - - "pmaddhw %[dest0_v], %[src0], %[mask_v] \n\t" - "pshufh %[dest0_u], %[src0], %[mask] \n\t" - "pmaddhw %[dest0_u], %[dest0_u], %[mask_u] \n\t" - "pmaddhw %[g0], %[src1], %[mask_v] \n\t" - "pshufh %[b0], %[src1], %[mask] \n\t" - "pmaddhw %[b0], %[b0], %[mask_u] \n\t" - - "punpcklwd %[src0], %[dest0_u], %[b0] \n\t" - "punpckhwd %[src1], %[dest0_u], %[b0] \n\t" - "psubw %[dest0_u], %[src0], %[src1] \n\t" - "psraw %[dest0_u], %[dest0_u], %[eight] \n\t" - "punpcklwd %[src0], %[dest0_v], %[g0] \n\t" - "punpckhwd %[src1], %[dest0_v], %[g0] \n\t" - "psubw %[dest0_v], %[src1], %[src0] \n\t" - "psraw %[dest0_v], %[dest0_v], %[eight] \n\t" - - "gsldrc1 %[src0], 0x08(%[src_argb4444]) \n\t" - "gsldlc1 %[src0], 0x0f(%[src_argb4444]) \n\t" - "gsldrc1 %[src1], 0x08(%[next_argb4444]) \n\t" - "gsldlc1 %[src1], 0x0f(%[next_argb4444]) \n\t" - "psrlh %[dest1_u], %[src0], %[eight] \n\t" - "and %[b0], %[src0], %[c0] \n\t" - "and %[src0], %[src0], %[c1] \n\t" - "psrlh %[g0], %[src0], %[four] \n\t" - "and %[r0], %[dest1_u], %[c0] \n\t" - "psrlh %[src0], %[src1], %[eight] \n\t" - "and %[dest1_u], %[src1], %[c0] \n\t" - "and %[src1], %[src1], %[c1] \n\t" - "psrlh %[dest1_v], %[src1], %[four] \n\t" - "and %[src0], %[src0], %[c0] \n\t" - "paddh %[b0], %[b0], %[dest1_u] \n\t" - "paddh %[g0], %[g0], %[dest1_v] \n\t" - "paddh %[r0], %[r0], %[src0] \n\t" - "punpcklhw %[src0], %[b0], %[r0] \n\t" - "punpckhhw %[src1], %[b0], %[r0] \n\t" - "punpcklwd %[dest1_u], %[src0], %[src1] \n\t" - "punpckhwd %[dest1_v], %[src0], %[src1] \n\t" - "paddh %[src0], %[dest1_u], %[dest1_v] \n\t" - "psrlh %[b0], %[src0], %[four] \n\t" - "psllh %[r0], %[src0], %[two] \n\t" - "or %[b0], %[b0], %[r0] \n\t" - "psrlh %[r0], %[g0], %[four] \n\t" - "psllh %[g0], %[g0], %[two] \n\t" - 
"or %[g0], %[g0], %[r0] \n\t" - "punpcklhw %[src0], %[g0], %[value] \n\t" - "punpckhhw %[src1], %[g0], %[value] \n\t" - "punpcklwd %[dest1_u], %[src0], %[src1] \n\t" - "punpckhwd %[dest1_v], %[src0], %[src1] \n\t" - "paddh %[g0], %[dest1_u], %[dest1_v] \n\t" - "punpcklhw %[src0], %[b0], %[g0] \n\t" - "punpckhhw %[src1], %[b0], %[g0] \n\t" - - "pmaddhw %[dest1_v], %[src0], %[mask_v] \n\t" - "pshufh %[dest1_u], %[src0], %[mask] \n\t" - "pmaddhw %[dest1_u], %[dest1_u], %[mask_u] \n\t" - "pmaddhw %[g0], %[src1], %[mask_v] \n\t" - "pshufh %[b0], %[src1], %[mask] \n\t" - "pmaddhw %[b0], %[b0], %[mask_u] \n\t" - - "punpcklwd %[src0], %[dest1_u], %[b0] \n\t" - "punpckhwd %[src1], %[dest1_u], %[b0] \n\t" - "psubw %[dest1_u], %[src0], %[src1] \n\t" - "psraw %[dest1_u], %[dest1_u], %[eight] \n\t" - "punpcklwd %[src0], %[dest1_v], %[g0] \n\t" - "punpckhwd %[src1], %[dest1_v], %[g0] \n\t" - "psubw %[dest1_v], %[src1], %[src0] \n\t" - "psraw %[dest1_v], %[dest1_v], %[eight] \n\t" - - "gsldrc1 %[src0], 0x10(%[src_argb4444]) \n\t" - "gsldlc1 %[src0], 0x17(%[src_argb4444]) \n\t" - "gsldrc1 %[src1], 0x10(%[next_argb4444]) \n\t" - "gsldlc1 %[src1], 0x17(%[next_argb4444]) \n\t" - "psrlh %[dest2_u], %[src0], %[eight] \n\t" - "and %[b0], %[src0], %[c0] \n\t" - "and %[src0], %[src0], %[c1] \n\t" - "psrlh %[g0], %[src0], %[four] \n\t" - "and %[r0], %[dest2_u], %[c0] \n\t" - "psrlh %[src0], %[src1], %[eight] \n\t" - "and %[dest2_u], %[src1], %[c0] \n\t" - "and %[src1], %[src1], %[c1] \n\t" - "psrlh %[dest2_v], %[src1], %[four] \n\t" - "and %[src0], %[src0], %[c0] \n\t" - "paddh %[b0], %[b0], %[dest2_u] \n\t" - "paddh %[g0], %[g0], %[dest2_v] \n\t" - "paddh %[r0], %[r0], %[src0] \n\t" - "punpcklhw %[src0], %[b0], %[r0] \n\t" - "punpckhhw %[src1], %[b0], %[r0] \n\t" - "punpcklwd %[dest2_u], %[src0], %[src1] \n\t" - "punpckhwd %[dest2_v], %[src0], %[src1] \n\t" - "paddh %[src0], %[dest2_u], %[dest2_v] \n\t" - "psrlh %[b0], %[src0], %[four] \n\t" - "psllh %[r0], %[src0], %[two] \n\t" - "or %[b0], %[b0], %[r0] \n\t" - "psrlh %[r0], %[g0], %[four] \n\t" - "psllh %[g0], %[g0], %[two] \n\t" - "or %[g0], %[g0], %[r0] \n\t" - "punpcklhw %[src0], %[g0], %[value] \n\t" - "punpckhhw %[src1], %[g0], %[value] \n\t" - "punpcklwd %[dest2_u], %[src0], %[src1] \n\t" - "punpckhwd %[dest2_v], %[src0], %[src1] \n\t" - "paddh %[g0], %[dest2_u], %[dest2_v] \n\t" - "punpcklhw %[src0], %[b0], %[g0] \n\t" - "punpckhhw %[src1], %[b0], %[g0] \n\t" - - "pmaddhw %[dest2_v], %[src0], %[mask_v] \n\t" - "pshufh %[dest2_u], %[src0], %[mask] \n\t" - "pmaddhw %[dest2_u], %[dest2_u], %[mask_u] \n\t" - "pmaddhw %[g0], %[src1], %[mask_v] \n\t" - "pshufh %[b0], %[src1], %[mask] \n\t" - "pmaddhw %[b0], %[b0], %[mask_u] \n\t" - - "punpcklwd %[src0], %[dest2_u], %[b0] \n\t" - "punpckhwd %[src1], %[dest2_u], %[b0] \n\t" - "psubw %[dest2_u], %[src0], %[src1] \n\t" - "psraw %[dest2_u], %[dest2_u], %[eight] \n\t" - "punpcklwd %[src0], %[dest2_v], %[g0] \n\t" - "punpckhwd %[src1], %[dest2_v], %[g0] \n\t" - "psubw %[dest2_v], %[src1], %[src0] \n\t" - "psraw %[dest2_v], %[dest2_v], %[eight] \n\t" - - "gsldrc1 %[src0], 0x18(%[src_argb4444]) \n\t" - "gsldlc1 %[src0], 0x1f(%[src_argb4444]) \n\t" - "gsldrc1 %[src1], 0x18(%[next_argb4444]) \n\t" - "gsldlc1 %[src1], 0x1f(%[next_argb4444]) \n\t" - "psrlh %[dest3_u], %[src0], %[eight] \n\t" - "and %[b0], %[src0], %[c0] \n\t" - "and %[src0], %[src0], %[c1] \n\t" - "psrlh %[g0], %[src0], %[four] \n\t" - "and %[r0], %[dest3_u], %[c0] \n\t" - "psrlh %[src0], %[src1], %[eight] \n\t" - "and %[dest3_u], %[src1], %[c0] \n\t" - 
"and %[src1], %[src1], %[c1] \n\t" - "psrlh %[dest3_v], %[src1], %[four] \n\t" - "and %[src0], %[src0], %[c0] \n\t" - "paddh %[b0], %[b0], %[dest3_u] \n\t" - "paddh %[g0], %[g0], %[dest3_v] \n\t" - "paddh %[r0], %[r0], %[src0] \n\t" - "punpcklhw %[src0], %[b0], %[r0] \n\t" - "punpckhhw %[src1], %[b0], %[r0] \n\t" - "punpcklwd %[dest3_u], %[src0], %[src1] \n\t" - "punpckhwd %[dest3_v], %[src0], %[src1] \n\t" - "paddh %[src0], %[dest3_u], %[dest3_v] \n\t" - "psrlh %[b0], %[src0], %[four] \n\t" - "psllh %[r0], %[src0], %[two] \n\t" - "or %[b0], %[b0], %[r0] \n\t" - "psrlh %[r0], %[g0], %[four] \n\t" - "psllh %[g0], %[g0], %[two] \n\t" - "or %[g0], %[g0], %[r0] \n\t" - "punpcklhw %[src0], %[g0], %[value] \n\t" - "punpckhhw %[src1], %[g0], %[value] \n\t" - "punpcklwd %[dest3_u], %[src0], %[src1] \n\t" - "punpckhwd %[dest3_v], %[src0], %[src1] \n\t" - "paddh %[g0], %[dest3_u], %[dest3_v] \n\t" - "punpcklhw %[src0], %[b0], %[g0] \n\t" - "punpckhhw %[src1], %[b0], %[g0] \n\t" - - "pmaddhw %[dest3_v], %[src0], %[mask_v] \n\t" - "pshufh %[dest3_u], %[src0], %[mask] \n\t" - "pmaddhw %[dest3_u], %[dest3_u], %[mask_u] \n\t" - "pmaddhw %[g0], %[src1], %[mask_v] \n\t" - "pshufh %[b0], %[src1], %[mask] \n\t" - "pmaddhw %[b0], %[b0], %[mask_u] \n\t" - - "punpcklwd %[src0], %[dest3_u], %[b0] \n\t" - "punpckhwd %[src1], %[dest3_u], %[b0] \n\t" - "psubw %[dest3_u], %[src0], %[src1] \n\t" - "psraw %[dest3_u], %[dest3_u], %[eight] \n\t" - "punpcklwd %[src0], %[dest3_v], %[g0] \n\t" - "punpckhwd %[src1], %[dest3_v], %[g0] \n\t" - "psubw %[dest3_v], %[src1], %[src0] \n\t" - "psraw %[dest3_v], %[dest3_v], %[eight] \n\t" - - "packsswh %[src0], %[dest0_u], %[dest1_u] \n\t" - "packsswh %[src1], %[dest2_u], %[dest3_u] \n\t" - "packushb %[dest0_u], %[src0], %[src1] \n\t" - "gssdlc1 %[dest0_u], 0x07(%[dst_u]) \n\t" - "gssdrc1 %[dest0_u], 0x00(%[dst_u]) \n\t" - "packsswh %[src0], %[dest0_v], %[dest1_v] \n\t" - "packsswh %[src1], %[dest2_v], %[dest3_v] \n\t" - "packushb %[dest0_v], %[src0], %[src1] \n\t" - "gssdlc1 %[dest0_v], 0x07(%[dst_v]) \n\t" - "gssdrc1 %[dest0_v], 0x00(%[dst_v]) \n\t" - - "daddiu %[src_argb4444], %[src_argb4444], 0x20 \n\t" - "daddiu %[next_argb4444], %[next_argb4444], 0x20 \n\t" - "daddiu %[dst_u], %[dst_u], 0x08 \n\t" - "daddiu %[dst_v], %[dst_v], 0x08 \n\t" - "daddiu %[width], %[width], -0x10 \n\t" - "bgtz %[width], 1b \n\t" - : [src0] "=&f"(ftmp[0]), [src1] "=&f"(ftmp[1]), [b0] "=&f"(ftmp[2]), - [g0] "=&f"(ftmp[3]), [r0] "=&f"(ftmp[4]), [dest0_u] "=&f"(ftmp[5]), - [dest1_u] "=&f"(ftmp[6]), [dest2_u] "=&f"(ftmp[7]), - [dest3_u] "=&f"(ftmp[8]), [dest0_v] "=&f"(ftmp[9]), - [dest1_v] "=&f"(ftmp[10]), [dest2_v] "=&f"(ftmp[11]), - [dest3_v] "=&f"(ftmp[12]) - : [src_argb4444] "r"(src_argb4444), - [next_argb4444] "r"(src_stride_argb4444), [dst_u] "r"(dst_u), - [dst_v] "r"(dst_v), [width] "r"(width), [value] "f"(value), - [c0] "f"(c0), [c1] "f"(c1), [mask] "f"(mask), [mask_u] "f"(mask_u), - [mask_v] "f"(mask_v), [eight] "f"(0x08), [four] "f"(0x04), - [two] "f"(0x02) - : "memory"); -} - -void ARGBToUV444Row_MMI(const uint8_t* src_argb, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - uint64_t ftmp[12]; - const uint64_t value = 0x4040; - const uint64_t mask_u = 0x0026004a00700002; - const uint64_t mask_v = 0x00020070005e0012; - - __asm__ volatile( - "1: \n\t" - "gsldrc1 %[src0], 0x00(%[src_argb]) \n\t" - "gsldlc1 %[src0], 0x07(%[src_argb]) \n\t" - "punpcklbh %[src_lo], %[src0], %[zero] \n\t" - "punpckhbh %[src_hi], %[src0], %[zero] \n\t" - "dsll %[dest0_u], %[src_lo], %[sixteen] \n\t" - "pinsrh_0 
%[dest0_u], %[dest0_u], %[value] \n\t" - "pinsrh_3 %[dest0_v], %[src_lo], %[value] \n\t" - "pmaddhw %[dest0_u], %[dest0_u], %[mask_u] \n\t" - "pmaddhw %[dest0_v], %[dest0_v], %[mask_v] \n\t" - - "dsll %[src_lo], %[src_hi], %[sixteen] \n\t" - "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t" - "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t" - - "punpcklwd %[src0], %[dest0_u], %[src_lo] \n\t" - "punpckhwd %[src1], %[dest0_u], %[src_lo] \n\t" - "psubw %[dest0_u], %[src0], %[src1] \n\t" - "psraw %[dest0_u], %[dest0_u], %[eight] \n\t" - "punpcklwd %[src0], %[dest0_v], %[src_hi] \n\t" - "punpckhwd %[src1], %[dest0_v], %[src_hi] \n\t" - "psubw %[dest0_v], %[src1], %[src0] \n\t" - "psraw %[dest0_v], %[dest0_v], %[eight] \n\t" - - "gsldrc1 %[src0], 0x08(%[src_argb]) \n\t" - "gsldlc1 %[src0], 0x0f(%[src_argb]) \n\t" - "punpcklbh %[src_lo], %[src0], %[zero] \n\t" - "punpckhbh %[src_hi], %[src0], %[zero] \n\t" - "dsll %[dest1_u], %[src_lo], %[sixteen] \n\t" - "pinsrh_0 %[dest1_u], %[dest1_u], %[value] \n\t" - "pinsrh_3 %[dest1_v], %[src_lo], %[value] \n\t" - "pmaddhw %[dest1_u], %[dest1_u], %[mask_u] \n\t" - "pmaddhw %[dest1_v], %[dest1_v], %[mask_v] \n\t" - "dsll %[src_lo], %[src_hi], %[sixteen] \n\t" - "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t" - "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t" - - "punpcklwd %[src0], %[dest1_u], %[src_lo] \n\t" - "punpckhwd %[src1], %[dest1_u], %[src_lo] \n\t" - "psubw %[dest1_u], %[src0], %[src1] \n\t" - "psraw %[dest1_u], %[dest1_u], %[eight] \n\t" - "punpcklwd %[src0], %[dest1_v], %[src_hi] \n\t" - "punpckhwd %[src1], %[dest1_v], %[src_hi] \n\t" - "psubw %[dest1_v], %[src1], %[src0] \n\t" - "psraw %[dest1_v], %[dest1_v], %[eight] \n\t" - - "gsldrc1 %[src0], 0x10(%[src_argb]) \n\t" - "gsldlc1 %[src0], 0x17(%[src_argb]) \n\t" - "punpcklbh %[src_lo], %[src0], %[zero] \n\t" - "punpckhbh %[src_hi], %[src0], %[zero] \n\t" - "dsll %[dest2_u], %[src_lo], %[sixteen] \n\t" - "pinsrh_0 %[dest2_u], %[dest2_u], %[value] \n\t" - "pinsrh_3 %[dest2_v], %[src_lo], %[value] \n\t" - "pmaddhw %[dest2_u], %[dest2_u], %[mask_u] \n\t" - "pmaddhw %[dest2_v], %[dest2_v], %[mask_v] \n\t" - "dsll %[src_lo], %[src_hi], %[sixteen] \n\t" - "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t" - "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t" - - "punpcklwd %[src0], %[dest2_u], %[src_lo] \n\t" - "punpckhwd %[src1], %[dest2_u], %[src_lo] \n\t" - "psubw %[dest2_u], %[src0], %[src1] \n\t" - "psraw %[dest2_u], %[dest2_u], %[eight] \n\t" - "punpcklwd %[src0], %[dest2_v], %[src_hi] \n\t" - "punpckhwd %[src1], %[dest2_v], %[src_hi] \n\t" - "psubw %[dest2_v], %[src1], %[src0] \n\t" - "psraw %[dest2_v], %[dest2_v], %[eight] \n\t" - - "gsldrc1 %[src0], 0x18(%[src_argb]) \n\t" - "gsldlc1 %[src0], 0x1f(%[src_argb]) \n\t" - "punpcklbh %[src_lo], %[src0], %[zero] \n\t" - "punpckhbh %[src_hi], %[src0], %[zero] \n\t" - "dsll %[dest3_u], %[src_lo], %[sixteen] \n\t" - "pinsrh_0 %[dest3_u], %[dest3_u], %[value] \n\t" - "pinsrh_3 %[dest3_v], %[src_lo], %[value] \n\t" - "pmaddhw %[dest3_u], %[dest3_u], %[mask_u] \n\t" - "pmaddhw %[dest3_v], %[dest3_v], %[mask_v] \n\t" - "dsll %[src_lo], %[src_hi], %[sixteen] \n\t" - "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t" - "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t" - "pmaddhw %[src_lo], 
%[src_lo], %[mask_u] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t" - - "punpcklwd %[src0], %[dest3_u], %[src_lo] \n\t" - "punpckhwd %[src1], %[dest3_u], %[src_lo] \n\t" - "psubw %[dest3_u], %[src0], %[src1] \n\t" - "psraw %[dest3_u], %[dest3_u], %[eight] \n\t" - "punpcklwd %[src0], %[dest3_v], %[src_hi] \n\t" - "punpckhwd %[src1], %[dest3_v], %[src_hi] \n\t" - "psubw %[dest3_v], %[src1], %[src0] \n\t" - "psraw %[dest3_v], %[dest3_v], %[eight] \n\t" - - "packsswh %[src0], %[dest0_u], %[dest1_u] \n\t" - "packsswh %[src1], %[dest2_u], %[dest3_u] \n\t" - "packushb %[dest0_u], %[src0], %[src1] \n\t" - "gssdlc1 %[dest0_u], 0x07(%[dst_u]) \n\t" - "gssdrc1 %[dest0_u], 0x00(%[dst_u]) \n\t" - - "packsswh %[src0], %[dest0_v], %[dest1_v] \n\t" - "packsswh %[src1], %[dest2_v], %[dest3_v] \n\t" - "packushb %[dest0_v], %[src0], %[src1] \n\t" - "gssdlc1 %[dest0_v], 0x07(%[dst_v]) \n\t" - "gssdrc1 %[dest0_v], 0x00(%[dst_v]) \n\t" - - "daddiu %[src_argb], %[src_argb], 0x20 \n\t" - "daddiu %[dst_u], %[dst_u], 0x08 \n\t" - "daddiu %[dst_v], %[dst_v], 0x08 \n\t" - "daddi %[width], %[width], -0x08 \n\t" - "bgtz %[width], 1b \n\t" - : [src0] "=&f"(ftmp[0]), [src1] "=&f"(ftmp[1]), [src_lo] "=&f"(ftmp[2]), - [src_hi] "=&f"(ftmp[3]), [dest0_u] "=&f"(ftmp[4]), - [dest0_v] "=&f"(ftmp[5]), [dest1_u] "=&f"(ftmp[6]), - [dest1_v] "=&f"(ftmp[7]), [dest2_u] "=&f"(ftmp[8]), - [dest2_v] "=&f"(ftmp[9]), [dest3_u] "=&f"(ftmp[10]), - [dest3_v] "=&f"(ftmp[11]) - : [src_argb] "r"(src_argb), [dst_u] "r"(dst_u), [dst_v] "r"(dst_v), - [width] "r"(width), [mask_u] "f"(mask_u), [mask_v] "f"(mask_v), - [value] "f"(value), [zero] "f"(0x00), [sixteen] "f"(0x10), - [eight] "f"(0x08) - : "memory"); -} - -void ARGBGrayRow_MMI(const uint8_t* src_argb, uint8_t* dst_argb, int width) { - uint64_t src, src_lo, src_hi, src37, dest, dest_lo, dest_hi; - uint64_t tmp0, tmp1; - const uint64_t mask0 = 0x0; - const uint64_t mask1 = 0x01; - const uint64_t mask2 = 0x0080004D0096001DULL; - const uint64_t mask3 = 0xFF000000FF000000ULL; - const uint64_t mask4 = ~mask3; - const uint64_t shift = 0x08; - - __asm__ volatile( - "1: \n\t" - "gsldlc1 %[src], 0x07(%[src_ptr]) \n\t" - "gsldrc1 %[src], 0x00(%[src_ptr]) \n\t" - - "and %[src37], %[src], %[mask3] \n\t" - - "punpcklbh %[src_lo], %[src], %[mask0] \n\t" - "pinsrh_3 %[src_lo], %[src_lo], %[mask1] \n\t" - "pmaddhw %[dest_lo], %[src_lo], %[mask2] \n\t" - "punpcklwd %[tmp0], %[dest_lo], %[dest_lo] \n\t" - "punpckhwd %[tmp1], %[dest_lo], %[dest_lo] \n\t" - "paddw %[dest_lo], %[tmp0], %[tmp1] \n\t" - "psrlw %[dest_lo], %[dest_lo], %[shift] \n\t" - "packsswh %[dest_lo], %[dest_lo], %[dest_lo] \n\t" - - "punpckhbh %[src_hi], %[src], %[mask0] \n\t" - "pinsrh_3 %[src_hi], %[src_hi], %[mask1] \n\t" - "pmaddhw %[dest_hi], %[src_hi], %[mask2] \n\t" - "punpcklwd %[tmp0], %[dest_hi], %[dest_hi] \n\t" - "punpckhwd %[tmp1], %[dest_hi], %[dest_hi] \n\t" - "paddw %[dest_hi], %[tmp0], %[tmp1] \n\t" - "psrlw %[dest_hi], %[dest_hi], %[shift] \n\t" - "packsswh %[dest_hi], %[dest_hi], %[dest_hi] \n\t" - - "packushb %[dest], %[dest_lo], %[dest_hi] \n\t" - "and %[dest], %[dest], %[mask4] \n\t" - "or %[dest], %[dest], %[src37] \n\t" - - "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t" - "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t" - - "daddiu %[src_ptr], %[src_ptr], 0x08 \n\t" - "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t" - "daddi %[width], %[width], -0x02 \n\t" - "bnez %[width], 1b \n\t" - : [dest_hi] "=&f"(dest_hi), [dest_lo] "=&f"(dest_lo), - [src_hi] "=&f"(src_hi), [src_lo] "=&f"(src_lo), [tmp0] "=&f"(tmp0), - [tmp1] "=&f"(tmp1), 
[src] "=&f"(src), [dest] "=&f"(dest), - [src37] "=&f"(src37) - : [src_ptr] "r"(src_argb), [dst_ptr] "r"(dst_argb), [width] "r"(width), - [shift] "f"(shift), [mask0] "f"(mask0), [mask1] "f"(mask1), - [mask2] "f"(mask2), [mask3] "f"(mask3), [mask4] "f"(mask4) - : "memory"); -} - -// Convert a row of image to Sepia tone. -void ARGBSepiaRow_MMI(uint8_t* dst_argb, int width) { - uint64_t dest, dest_lo, dest_hi, dest37, dest0, dest1, dest2; - uint64_t tmp0, tmp1; - const uint64_t mask0 = 0x0; - const uint64_t mask1 = 0x002300440011ULL; - const uint64_t mask2 = 0x002D00580016ULL; - const uint64_t mask3 = 0x003200620018ULL; - const uint64_t mask4 = 0xFF000000FF000000ULL; - const uint64_t shift = 0x07; - - __asm__ volatile( - "1: \n\t" - "gsldlc1 %[dest], 0x07(%[dst_ptr]) \n\t" - "gsldrc1 %[dest], 0x00(%[dst_ptr]) \n\t" - - "and %[dest37], %[dest], %[mask4] \n\t" - - "punpcklbh %[dest_lo], %[dest], %[mask0] \n\t" - "pmaddhw %[dest0], %[dest_lo], %[mask1] \n\t" - "pmaddhw %[dest1], %[dest_lo], %[mask2] \n\t" - "pmaddhw %[dest2], %[dest_lo], %[mask3] \n\t" - "punpcklwd %[tmp0], %[dest0], %[dest1] \n\t" - "punpckhwd %[tmp1], %[dest0], %[dest1] \n\t" - "paddw %[dest0], %[tmp0], %[tmp1] \n\t" - "psrlw %[dest0], %[dest0], %[shift] \n\t" - "punpcklwd %[tmp0], %[dest2], %[mask0] \n\t" - "punpckhwd %[tmp1], %[dest2], %[mask0] \n\t" - "paddw %[dest1], %[tmp0], %[tmp1] \n\t" - "psrlw %[dest1], %[dest1], %[shift] \n\t" - "packsswh %[dest_lo], %[dest0], %[dest1] \n\t" - - "punpckhbh %[dest_hi], %[dest], %[mask0] \n\t" - "pmaddhw %[dest0], %[dest_hi], %[mask1] \n\t" - "pmaddhw %[dest1], %[dest_hi], %[mask2] \n\t" - "pmaddhw %[dest2], %[dest_hi], %[mask3] \n\t" - "punpcklwd %[tmp0], %[dest0], %[dest1] \n\t" - "punpckhwd %[tmp1], %[dest0], %[dest1] \n\t" - "paddw %[dest0], %[tmp0], %[tmp1] \n\t" - "psrlw %[dest0], %[dest0], %[shift] \n\t" - "punpcklwd %[tmp0], %[dest2], %[mask0] \n\t" - "punpckhwd %[tmp1], %[dest2], %[mask0] \n\t" - "paddw %[dest1], %[tmp0], %[tmp1] \n\t" - "psrlw %[dest1], %[dest1], %[shift] \n\t" - "packsswh %[dest_hi], %[dest0], %[dest1] \n\t" - - "packushb %[dest], %[dest_lo], %[dest_hi] \n\t" - "or %[dest], %[dest], %[dest37] \n\t" - - "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t" - "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t" - - "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t" - "daddi %[width], %[width], -0x02 \n\t" - "bnez %[width], 1b \n\t" - : [dest_hi] "=&f"(dest_hi), [dest_lo] "=&f"(dest_lo), - [dest0] "=&f"(dest0), [dest1] "=&f"(dest1), [dest2] "=&f"(dest2), - [dest37] "=&f"(dest37), [tmp0] "=&f"(tmp0), [tmp1] "=&f"(tmp1), - [dest] "=&f"(dest) - : [dst_ptr] "r"(dst_argb), [width] "r"(width), [mask0] "f"(mask0), - [mask1] "f"(mask1), [mask2] "f"(mask2), [mask3] "f"(mask3), - [mask4] "f"(mask4), [shift] "f"(shift) - : "memory"); -} - -// Apply color matrix to a row of image. Matrix is signed. -// TODO(fbarchard): Consider adding rounding (+32). 
-void ARGBColorMatrixRow_MMI(const uint8_t* src_argb, - uint8_t* dst_argb, - const int8_t* matrix_argb, - int width) { - uint64_t src, src_hi, src_lo, dest, dest_lo, dest_hi, dest0, dest1, dest2, - dest3; - uint64_t matrix, matrix_hi, matrix_lo; - uint64_t tmp0, tmp1; - const uint64_t shift0 = 0x06; - const uint64_t shift1 = 0x08; - const uint64_t mask0 = 0x0; - const uint64_t mask1 = 0x08; - - __asm__ volatile( - "1: \n\t" - "gsldlc1 %[src], 0x07(%[src_ptr]) \n\t" - "gsldrc1 %[src], 0x00(%[src_ptr]) \n\t" - - "punpcklbh %[src_lo], %[src], %[mask0] \n\t" - - "gsldlc1 %[matrix], 0x07(%[matrix_ptr]) \n\t" - "gsldrc1 %[matrix], 0x00(%[matrix_ptr]) \n\t" - "punpcklbh %[matrix_lo], %[matrix], %[mask0] \n\t" - "psllh %[matrix_lo], %[matrix_lo], %[shift1] \n\t" - "psrah %[matrix_lo], %[matrix_lo], %[shift1] \n\t" - "punpckhbh %[matrix_hi], %[matrix], %[mask0] \n\t" - "psllh %[matrix_hi], %[matrix_hi], %[shift1] \n\t" - "psrah %[matrix_hi], %[matrix_hi], %[shift1] \n\t" - "pmaddhw %[dest_lo], %[src_lo], %[matrix_lo] \n\t" - "pmaddhw %[dest_hi], %[src_lo], %[matrix_hi] \n\t" - "punpcklwd %[tmp0], %[dest_lo], %[dest_hi] \n\t" - "punpckhwd %[tmp1], %[dest_lo], %[dest_hi] \n\t" - "paddw %[dest0], %[tmp0], %[tmp1] \n\t" - "psraw %[dest0], %[dest0], %[shift0] \n\t" - - "gsldlc1 %[matrix], 0x0f(%[matrix_ptr]) \n\t" - "gsldrc1 %[matrix], 0x08(%[matrix_ptr]) \n\t" - "punpcklbh %[matrix_lo], %[matrix], %[mask0] \n\t" - "psllh %[matrix_lo], %[matrix_lo], %[shift1] \n\t" - "psrah %[matrix_lo], %[matrix_lo], %[shift1] \n\t" - "punpckhbh %[matrix_hi], %[matrix], %[mask0] \n\t" - "psllh %[matrix_hi], %[matrix_hi], %[shift1] \n\t" - "psrah %[matrix_hi], %[matrix_hi], %[shift1] \n\t" - "pmaddhw %[dest_lo], %[src_lo], %[matrix_lo] \n\t" - "pmaddhw %[dest_hi], %[src_lo], %[matrix_hi] \n\t" - "punpcklwd %[tmp0], %[dest_lo], %[dest_hi] \n\t" - "punpckhwd %[tmp1], %[dest_lo], %[dest_hi] \n\t" - "paddw %[dest1], %[tmp0], %[tmp1] \n\t" - "psraw %[dest1], %[dest1], %[shift0] \n\t" - - "punpckhbh %[src_hi], %[src], %[mask0] \n\t" - - "gsldlc1 %[matrix], 0x07(%[matrix_ptr]) \n\t" - "gsldrc1 %[matrix], 0x00(%[matrix_ptr]) \n\t" - "punpcklbh %[matrix_lo], %[matrix], %[mask0] \n\t" - "psllh %[matrix_lo], %[matrix_lo], %[shift1] \n\t" - "psrah %[matrix_lo], %[matrix_lo], %[shift1] \n\t" - "punpckhbh %[matrix_hi], %[matrix], %[mask0] \n\t" - "psllh %[matrix_hi], %[matrix_hi], %[shift1] \n\t" - "psrah %[matrix_hi], %[matrix_hi], %[shift1] \n\t" - "pmaddhw %[dest_lo], %[src_hi], %[matrix_lo] \n\t" - "pmaddhw %[dest_hi], %[src_hi], %[matrix_hi] \n\t" - "punpcklwd %[tmp0], %[dest_lo], %[dest_hi] \n\t" - "punpckhwd %[tmp1], %[dest_lo], %[dest_hi] \n\t" - "paddw %[dest2], %[tmp0], %[tmp1] \n\t" - "psraw %[dest2], %[dest2], %[shift0] \n\t" - - "gsldlc1 %[matrix], 0x0f(%[matrix_ptr]) \n\t" - "gsldrc1 %[matrix], 0x08(%[matrix_ptr]) \n\t" - "punpcklbh %[matrix_lo], %[matrix], %[mask0] \n\t" - "psllh %[matrix_lo], %[matrix_lo], %[shift1] \n\t" - "psrah %[matrix_lo], %[matrix_lo], %[shift1] \n\t" - "punpckhbh %[matrix_hi], %[matrix], %[mask0] \n\t" - "psllh %[matrix_hi], %[matrix_hi], %[shift1] \n\t" - "psrah %[matrix_hi], %[matrix_hi], %[shift1] \n\t" - "pmaddhw %[dest_lo], %[src_hi], %[matrix_lo] \n\t" - "pmaddhw %[dest_hi], %[src_hi], %[matrix_hi] \n\t" - "punpcklwd %[tmp0], %[dest_lo], %[dest_hi] \n\t" - "punpckhwd %[tmp1], %[dest_lo], %[dest_hi] \n\t" - "paddw %[dest3], %[tmp0], %[tmp1] \n\t" - "psraw %[dest3], %[dest3], %[shift0] \n\t" - - "packsswh %[tmp0], %[dest0], %[dest1] \n\t" - "packsswh %[tmp1], %[dest2], %[dest3] \n\t" - 
"packushb %[dest], %[tmp0], %[tmp1] \n\t" - - "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t" - "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t" - - "daddiu %[src_ptr], %[src_ptr], 0x08 \n\t" - "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t" - "daddi %[width], %[width], -0x02 \n\t" - "bnez %[width], 1b \n\t" - : [src_hi] "=&f"(src_hi), [src_lo] "=&f"(src_lo), - [dest_hi] "=&f"(dest_hi), [dest_lo] "=&f"(dest_lo), - [dest0] "=&f"(dest0), [dest1] "=&f"(dest1), [dest2] "=&f"(dest2), - [dest3] "=&f"(dest3), [src] "=&f"(src), [dest] "=&f"(dest), - [tmp0] "=&f"(tmp0), [tmp1] "=&f"(tmp1), [matrix_hi] "=&f"(matrix_hi), - [matrix_lo] "=&f"(matrix_lo), [matrix] "=&f"(matrix) - : [src_ptr] "r"(src_argb), [matrix_ptr] "r"(matrix_argb), - [dst_ptr] "r"(dst_argb), [width] "r"(width), [shift0] "f"(shift0), - [shift1] "f"(shift1), [mask0] "f"(mask0), [mask1] "f"(mask1) - : "memory"); -} - -void ARGBShadeRow_MMI(const uint8_t* src_argb, - uint8_t* dst_argb, - int width, - uint32_t value) { - uint64_t src, src_hi, src_lo, dest, dest_lo, dest_hi; - const uint64_t shift = 0x08; - - __asm__ volatile( - "1: \n\t" - "gsldlc1 %[src], 0x07(%[src_ptr]) \n\t" - "gsldrc1 %[src], 0x00(%[src_ptr]) \n\t" - "punpcklbh %[src_lo], %[src], %[src] \n\t" - "punpckhbh %[src_hi], %[src], %[src] \n\t" - - "punpcklbh %[value], %[value], %[value] \n\t" - - "pmulhuh %[dest_lo], %[src_lo], %[value] \n\t" - "psrlh %[dest_lo], %[dest_lo], %[shift] \n\t" - "pmulhuh %[dest_hi], %[src_hi], %[value] \n\t" - "psrlh %[dest_hi], %[dest_hi], %[shift] \n\t" - "packushb %[dest], %[dest_lo], %[dest_hi] \n\t" - - "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t" - "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t" - - "daddiu %[src_ptr], %[src_ptr], 0x08 \n\t" - "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t" - "daddi %[width], %[width], -0x02 \n\t" - "bnez %[width], 1b \n\t" - : [src_hi] "=&f"(src_hi), [src_lo] "=&f"(src_lo), - [dest_hi] "=&f"(dest_hi), [dest_lo] "=&f"(dest_lo), [src] "=&f"(src), - [dest] "=&f"(dest) - : [src_ptr] "r"(src_argb), [dst_ptr] "r"(dst_argb), [width] "r"(width), - [value] "f"(value), [shift] "f"(shift) - : "memory"); -} - -void ARGBMultiplyRow_MMI(const uint8_t* src_argb, - const uint8_t* src_argb1, - uint8_t* dst_argb, - int width) { - uint64_t src0, src0_hi, src0_lo, src1, src1_hi, src1_lo; - uint64_t dest, dest_lo, dest_hi; - const uint64_t mask = 0x0; - - __asm__ volatile( - "1: \n\t" - "gsldlc1 %[src0], 0x07(%[src0_ptr]) \n\t" - "gsldrc1 %[src0], 0x00(%[src0_ptr]) \n\t" - "punpcklbh %[src0_lo], %[src0], %[src0] \n\t" - "punpckhbh %[src0_hi], %[src0], %[src0] \n\t" - - "gsldlc1 %[src1], 0x07(%[src1_ptr]) \n\t" - "gsldrc1 %[src1], 0x00(%[src1_ptr]) \n\t" - "punpcklbh %[src1_lo], %[src1], %[mask] \n\t" - "punpckhbh %[src1_hi], %[src1], %[mask] \n\t" - - "pmulhuh %[dest_lo], %[src0_lo], %[src1_lo] \n\t" - "pmulhuh %[dest_hi], %[src0_hi], %[src1_hi] \n\t" - "packushb %[dest], %[dest_lo], %[dest_hi] \n\t" - - "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t" - "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t" - - "daddiu %[src0_ptr], %[src0_ptr], 0x08 \n\t" - "daddiu %[src1_ptr], %[src1_ptr], 0x08 \n\t" - "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t" - "daddi %[width], %[width], -0x02 \n\t" - "bnez %[width], 1b \n\t" - : [src0_hi] "=&f"(src0_hi), [src0_lo] "=&f"(src0_lo), - [src1_hi] "=&f"(src1_hi), [src1_lo] "=&f"(src1_lo), - [dest_hi] "=&f"(dest_hi), [dest_lo] "=&f"(dest_lo), [src0] "=&f"(src0), - [src1] "=&f"(src1), [dest] "=&f"(dest) - : [src0_ptr] "r"(src_argb), [src1_ptr] "r"(src_argb1), - [dst_ptr] "r"(dst_argb), [width] "r"(width), [mask] "f"(mask) - : "memory"); -} - -void 
ARGBAddRow_MMI(const uint8_t* src_argb, - const uint8_t* src_argb1, - uint8_t* dst_argb, - int width) { - uint64_t src0, src1, dest; - - __asm__ volatile( - "1: \n\t" - "gsldlc1 %[src0], 0x07(%[src0_ptr]) \n\t" - "gsldrc1 %[src0], 0x00(%[src0_ptr]) \n\t" - "gsldlc1 %[src1], 0x07(%[src1_ptr]) \n\t" - "gsldrc1 %[src1], 0x00(%[src1_ptr]) \n\t" - "paddusb %[dest], %[src0], %[src1] \n\t" - "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t" - "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t" - - "daddiu %[src0_ptr], %[src0_ptr], 0x08 \n\t" - "daddiu %[src1_ptr], %[src1_ptr], 0x08 \n\t" - "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t" - "daddi %[width], %[width], -0x02 \n\t" - "bnez %[width], 1b \n\t" - : [src0] "=&f"(src0), [src1] "=&f"(src1), [dest] "=&f"(dest) - : [src0_ptr] "r"(src_argb), [src1_ptr] "r"(src_argb1), - [dst_ptr] "r"(dst_argb), [width] "r"(width) - : "memory"); -} - -void ARGBSubtractRow_MMI(const uint8_t* src_argb, - const uint8_t* src_argb1, - uint8_t* dst_argb, - int width) { - uint64_t src0, src1, dest; - - __asm__ volatile( - "1: \n\t" - "gsldlc1 %[src0], 0x07(%[src0_ptr]) \n\t" - "gsldrc1 %[src0], 0x00(%[src0_ptr]) \n\t" - "gsldlc1 %[src1], 0x07(%[src1_ptr]) \n\t" - "gsldrc1 %[src1], 0x00(%[src1_ptr]) \n\t" - "psubusb %[dest], %[src0], %[src1] \n\t" - "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t" - "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t" - - "daddiu %[src0_ptr], %[src0_ptr], 0x08 \n\t" - "daddiu %[src1_ptr], %[src1_ptr], 0x08 \n\t" - "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t" - "daddi %[width], %[width], -0x02 \n\t" - "bnez %[width], 1b \n\t" - : [src0] "=&f"(src0), [src1] "=&f"(src1), [dest] "=&f"(dest) - : [src0_ptr] "r"(src_argb), [src1_ptr] "r"(src_argb1), - [dst_ptr] "r"(dst_argb), [width] "r"(width) - : "memory"); -} - -// Sobel functions which mimics SSSE3. 
-void SobelXRow_MMI(const uint8_t* src_y0, - const uint8_t* src_y1, - const uint8_t* src_y2, - uint8_t* dst_sobelx, - int width) { - uint64_t y00 = 0, y10 = 0, y20 = 0; - uint64_t y02 = 0, y12 = 0, y22 = 0; - uint64_t zero = 0x0; - uint64_t sobel = 0x0; - __asm__ volatile( - "1: \n\t" - "gsldlc1 %[y00], 0x07(%[src_y0]) \n\t" // a=src_y0[i] - "gsldrc1 %[y00], 0x00(%[src_y0]) \n\t" - "gsldlc1 %[y02], 0x09(%[src_y0]) \n\t" // a_sub=src_y0[i+2] - "gsldrc1 %[y02], 0x02(%[src_y0]) \n\t" - - "gsldlc1 %[y10], 0x07(%[src_y1]) \n\t" // b=src_y1[i] - "gsldrc1 %[y10], 0x00(%[src_y1]) \n\t" - "gsldlc1 %[y12], 0x09(%[src_y1]) \n\t" // b_sub=src_y1[i+2] - "gsldrc1 %[y12], 0x02(%[src_y1]) \n\t" - - "gsldlc1 %[y20], 0x07(%[src_y2]) \n\t" // c=src_y2[i] - "gsldrc1 %[y20], 0x00(%[src_y2]) \n\t" - "gsldlc1 %[y22], 0x09(%[src_y2]) \n\t" // c_sub=src_y2[i+2] - "gsldrc1 %[y22], 0x02(%[src_y2]) \n\t" - - "punpcklbh %[y00], %[y00], %[zero] \n\t" - "punpcklbh %[y10], %[y10], %[zero] \n\t" - "punpcklbh %[y20], %[y20], %[zero] \n\t" - - "punpcklbh %[y02], %[y02], %[zero] \n\t" - "punpcklbh %[y12], %[y12], %[zero] \n\t" - "punpcklbh %[y22], %[y22], %[zero] \n\t" - - "paddh %[y00], %[y00], %[y10] \n\t" // a+b - "paddh %[y20], %[y20], %[y10] \n\t" // c+b - "paddh %[y00], %[y00], %[y20] \n\t" // a+2b+c - - "paddh %[y02], %[y02], %[y12] \n\t" // a_sub+b_sub - "paddh %[y22], %[y22], %[y12] \n\t" // c_sub+b_sub - "paddh %[y02], %[y02], %[y22] \n\t" // a_sub+2b_sub+c_sub - - "pmaxsh %[y10], %[y00], %[y02] \n\t" - "pminsh %[y20], %[y00], %[y02] \n\t" - "psubh %[sobel], %[y10], %[y20] \n\t" // Abs - - "gsldlc1 %[y00], 0x0B(%[src_y0]) \n\t" - "gsldrc1 %[y00], 0x04(%[src_y0]) \n\t" - "gsldlc1 %[y02], 0x0D(%[src_y0]) \n\t" - "gsldrc1 %[y02], 0x06(%[src_y0]) \n\t" - - "gsldlc1 %[y10], 0x0B(%[src_y1]) \n\t" - "gsldrc1 %[y10], 0x04(%[src_y1]) \n\t" - "gsldlc1 %[y12], 0x0D(%[src_y1]) \n\t" - "gsldrc1 %[y12], 0x06(%[src_y1]) \n\t" - - "gsldlc1 %[y20], 0x0B(%[src_y2]) \n\t" - "gsldrc1 %[y20], 0x04(%[src_y2]) \n\t" - "gsldlc1 %[y22], 0x0D(%[src_y2]) \n\t" - "gsldrc1 %[y22], 0x06(%[src_y2]) \n\t" - - "punpcklbh %[y00], %[y00], %[zero] \n\t" - "punpcklbh %[y10], %[y10], %[zero] \n\t" - "punpcklbh %[y20], %[y20], %[zero] \n\t" - - "punpcklbh %[y02], %[y02], %[zero] \n\t" - "punpcklbh %[y12], %[y12], %[zero] \n\t" - "punpcklbh %[y22], %[y22], %[zero] \n\t" - - "paddh %[y00], %[y00], %[y10] \n\t" - "paddh %[y20], %[y20], %[y10] \n\t" - "paddh %[y00], %[y00], %[y20] \n\t" - - "paddh %[y02], %[y02], %[y12] \n\t" - "paddh %[y22], %[y22], %[y12] \n\t" - "paddh %[y02], %[y02], %[y22] \n\t" - - "pmaxsh %[y10], %[y00], %[y02] \n\t" - "pminsh %[y20], %[y00], %[y02] \n\t" - "psubh %[y00], %[y10], %[y20] \n\t" - - "packushb %[sobel], %[sobel], %[y00] \n\t" // clamp255 - "gssdrc1 %[sobel], 0(%[dst_sobelx]) \n\t" - "gssdlc1 %[sobel], 7(%[dst_sobelx]) \n\t" - - "daddiu %[src_y0], %[src_y0], 8 \n\t" - "daddiu %[src_y1], %[src_y1], 8 \n\t" - "daddiu %[src_y2], %[src_y2], 8 \n\t" - "daddiu %[dst_sobelx], %[dst_sobelx], 8 \n\t" - "daddiu %[width], %[width], -8 \n\t" - "bgtz %[width], 1b \n\t" - "nop \n\t" - : [sobel] "=&f"(sobel), [y00] "=&f"(y00), [y10] "=&f"(y10), - [y20] "=&f"(y20), [y02] "=&f"(y02), [y12] "=&f"(y12), [y22] "=&f"(y22) - : [src_y0] "r"(src_y0), [src_y1] "r"(src_y1), [src_y2] "r"(src_y2), - [dst_sobelx] "r"(dst_sobelx), [width] "r"(width), [zero] "f"(zero) - : "memory"); -} - -void SobelYRow_MMI(const uint8_t* src_y0, - const uint8_t* src_y1, - uint8_t* dst_sobely, - int width) { - uint64_t y00 = 0, y01 = 0, y02 = 0; - uint64_t y10 = 0, y11 
= 0, y12 = 0; - uint64_t zero = 0x0; - uint64_t sobel = 0x0; - __asm__ volatile( - "1: \n\t" - "gsldlc1 %[y00], 0x07(%[src_y0]) \n\t" // a=src_y0[i] - "gsldrc1 %[y00], 0x00(%[src_y0]) \n\t" - "gsldlc1 %[y01], 0x08(%[src_y0]) \n\t" // b=src_y0[i+1] - "gsldrc1 %[y01], 0x01(%[src_y0]) \n\t" - "gsldlc1 %[y02], 0x09(%[src_y0]) \n\t" // c=src_y0[i+2] - "gsldrc1 %[y02], 0x02(%[src_y0]) \n\t" - - "gsldlc1 %[y10], 0x07(%[src_y1]) \n\t" // a_sub=src_y1[i] - "gsldrc1 %[y10], 0x00(%[src_y1]) \n\t" - "gsldlc1 %[y11], 0x08(%[src_y1]) \n\t" // b_sub=src_y1[i+1] - "gsldrc1 %[y11], 0x01(%[src_y1]) \n\t" - "gsldlc1 %[y12], 0x09(%[src_y1]) \n\t" // c_sub=src_y1[i+2] - "gsldrc1 %[y12], 0x02(%[src_y1]) \n\t" - - "punpcklbh %[y00], %[y00], %[zero] \n\t" - "punpcklbh %[y01], %[y01], %[zero] \n\t" - "punpcklbh %[y02], %[y02], %[zero] \n\t" - - "punpcklbh %[y10], %[y10], %[zero] \n\t" - "punpcklbh %[y11], %[y11], %[zero] \n\t" - "punpcklbh %[y12], %[y12], %[zero] \n\t" - - "paddh %[y00], %[y00], %[y01] \n\t" // a+b - "paddh %[y02], %[y02], %[y01] \n\t" // c+b - "paddh %[y00], %[y00], %[y02] \n\t" // a+2b+c - - "paddh %[y10], %[y10], %[y11] \n\t" // a_sub+b_sub - "paddh %[y12], %[y12], %[y11] \n\t" // c_sub+b_sub - "paddh %[y10], %[y10], %[y12] \n\t" // a_sub+2b_sub+c_sub - - "pmaxsh %[y02], %[y00], %[y10] \n\t" - "pminsh %[y12], %[y00], %[y10] \n\t" - "psubh %[sobel], %[y02], %[y12] \n\t" // Abs - - "gsldlc1 %[y00], 0x0B(%[src_y0]) \n\t" - "gsldrc1 %[y00], 0x04(%[src_y0]) \n\t" - "gsldlc1 %[y01], 0x0C(%[src_y0]) \n\t" - "gsldrc1 %[y01], 0x05(%[src_y0]) \n\t" - "gsldlc1 %[y02], 0x0D(%[src_y0]) \n\t" - "gsldrc1 %[y02], 0x06(%[src_y0]) \n\t" - - "gsldlc1 %[y10], 0x0B(%[src_y1]) \n\t" - "gsldrc1 %[y10], 0x04(%[src_y1]) \n\t" - "gsldlc1 %[y11], 0x0C(%[src_y1]) \n\t" - "gsldrc1 %[y11], 0x05(%[src_y1]) \n\t" - "gsldlc1 %[y12], 0x0D(%[src_y1]) \n\t" - "gsldrc1 %[y12], 0x06(%[src_y1]) \n\t" - - "punpcklbh %[y00], %[y00], %[zero] \n\t" - "punpcklbh %[y01], %[y01], %[zero] \n\t" - "punpcklbh %[y02], %[y02], %[zero] \n\t" - - "punpcklbh %[y10], %[y10], %[zero] \n\t" - "punpcklbh %[y11], %[y11], %[zero] \n\t" - "punpcklbh %[y12], %[y12], %[zero] \n\t" - - "paddh %[y00], %[y00], %[y01] \n\t" - "paddh %[y02], %[y02], %[y01] \n\t" - "paddh %[y00], %[y00], %[y02] \n\t" - - "paddh %[y10], %[y10], %[y11] \n\t" - "paddh %[y12], %[y12], %[y11] \n\t" - "paddh %[y10], %[y10], %[y12] \n\t" - - "pmaxsh %[y02], %[y00], %[y10] \n\t" - "pminsh %[y12], %[y00], %[y10] \n\t" - "psubh %[y00], %[y02], %[y12] \n\t" - - "packushb %[sobel], %[sobel], %[y00] \n\t" // clamp255 - "gssdrc1 %[sobel], 0(%[dst_sobely]) \n\t" - "gssdlc1 %[sobel], 7(%[dst_sobely]) \n\t" - - "daddiu %[src_y0], %[src_y0], 8 \n\t" - "daddiu %[src_y1], %[src_y1], 8 \n\t" - "daddiu %[dst_sobely], %[dst_sobely], 8 \n\t" - "daddiu %[width], %[width], -8 \n\t" - "bgtz %[width], 1b \n\t" - "nop \n\t" - : [sobel] "=&f"(sobel), [y00] "=&f"(y00), [y01] "=&f"(y01), - [y02] "=&f"(y02), [y10] "=&f"(y10), [y11] "=&f"(y11), [y12] "=&f"(y12) - : [src_y0] "r"(src_y0), [src_y1] "r"(src_y1), - [dst_sobely] "r"(dst_sobely), [width] "r"(width), [zero] "f"(zero) - : "memory"); -} - -void SobelRow_MMI(const uint8_t* src_sobelx, - const uint8_t* src_sobely, - uint8_t* dst_argb, - int width) { - double temp[3]; - uint64_t c1 = 0xff000000ff000000; - __asm__ volatile( - "1: \n\t" - "gsldlc1 %[t0], 0x07(%[src_sobelx]) \n\t" // a=src_sobelx[i] - "gsldrc1 %[t0], 0x00(%[src_sobelx]) \n\t" - "gsldlc1 %[t1], 0x07(%[src_sobely]) \n\t" // b=src_sobely[i] - "gsldrc1 %[t1], 0x00(%[src_sobely]) \n\t" - // s7 s6 s5 
s4 s3 s2 s1 s0 = a+b - "paddusb %[t2] , %[t0], %[t1] \n\t" - - // s3 s2 s1 s0->s3 s3 s2 s2 s1 s1 s0 s0 - "punpcklbh %[t0], %[t2], %[t2] \n\t" - - // s1 s1 s0 s0->s1 s2 s1 s1 s0 s0 s0 s0 - "punpcklbh %[t1], %[t0], %[t0] \n\t" - "or %[t1], %[t1], %[c1] \n\t" - // 255 s1 s1 s1 s55 s0 s0 s0 - "gssdrc1 %[t1], 0x00(%[dst_argb]) \n\t" - "gssdlc1 %[t1], 0x07(%[dst_argb]) \n\t" - - // s3 s3 s2 s2->s3 s3 s3 s3 s2 s2 s2 s2 - "punpckhbh %[t1], %[t0], %[t0] \n\t" - "or %[t1], %[t1], %[c1] \n\t" - // 255 s3 s3 s3 255 s2 s2 s2 - "gssdrc1 %[t1], 0x08(%[dst_argb]) \n\t" - "gssdlc1 %[t1], 0x0f(%[dst_argb]) \n\t" - - // s7 s6 s5 s4->s7 s7 s6 s6 s5 s5 s4 s4 - "punpckhbh %[t0], %[t2], %[t2] \n\t" - - // s5 s5 s4 s4->s5 s5 s5 s5 s4 s4 s4 s4 - "punpcklbh %[t1], %[t0], %[t0] \n\t" - "or %[t1], %[t1], %[c1] \n\t" - "gssdrc1 %[t1], 0x10(%[dst_argb]) \n\t" - "gssdlc1 %[t1], 0x17(%[dst_argb]) \n\t" - - // s7 s7 s6 s6->s7 s7 s7 s7 s6 s6 s6 s6 - "punpckhbh %[t1], %[t0], %[t0] \n\t" - "or %[t1], %[t1], %[c1] \n\t" - "gssdrc1 %[t1], 0x18(%[dst_argb]) \n\t" - "gssdlc1 %[t1], 0x1f(%[dst_argb]) \n\t" - - "daddiu %[dst_argb], %[dst_argb], 32 \n\t" - "daddiu %[src_sobelx], %[src_sobelx], 8 \n\t" - "daddiu %[src_sobely], %[src_sobely], 8 \n\t" - "daddiu %[width], %[width], -8 \n\t" - "bgtz %[width], 1b \n\t" - "nop \n\t" - : [t0] "=&f"(temp[0]), [t1] "=&f"(temp[1]), [t2] "=&f"(temp[2]) - : [src_sobelx] "r"(src_sobelx), [src_sobely] "r"(src_sobely), - [dst_argb] "r"(dst_argb), [width] "r"(width), [c1] "f"(c1) - : "memory"); -} - -void SobelToPlaneRow_MMI(const uint8_t* src_sobelx, - const uint8_t* src_sobely, - uint8_t* dst_y, - int width) { - uint64_t tr = 0; - uint64_t tb = 0; - __asm__ volatile( - "1: \n\t" - "gsldrc1 %[tr], 0x0(%[src_sobelx]) \n\t" - "gsldlc1 %[tr], 0x7(%[src_sobelx]) \n\t" // r=src_sobelx[i] - "gsldrc1 %[tb], 0x0(%[src_sobely]) \n\t" - "gsldlc1 %[tb], 0x7(%[src_sobely]) \n\t" // b=src_sobely[i] - "paddusb %[tr], %[tr], %[tb] \n\t" // g - "gssdrc1 %[tr], 0x0(%[dst_y]) \n\t" - "gssdlc1 %[tr], 0x7(%[dst_y]) \n\t" - - "daddiu %[dst_y], %[dst_y], 8 \n\t" - "daddiu %[src_sobelx], %[src_sobelx], 8 \n\t" - "daddiu %[src_sobely], %[src_sobely], 8 \n\t" - "daddiu %[width], %[width], -8 \n\t" - "bgtz %[width], 1b \n\t" - "nop \n\t" - : [tr] "=&f"(tr), [tb] "=&f"(tb) - : [src_sobelx] "r"(src_sobelx), [src_sobely] "r"(src_sobely), - [dst_y] "r"(dst_y), [width] "r"(width) - : "memory"); -} - -void SobelXYRow_MMI(const uint8_t* src_sobelx, - const uint8_t* src_sobely, - uint8_t* dst_argb, - int width) { - uint64_t temp[3]; - uint64_t result = 0; - uint64_t gb = 0; - uint64_t cr = 0; - uint64_t c1 = 0xffffffffffffffff; - __asm__ volatile( - "1: \n\t" - "gsldlc1 %[tr], 0x07(%[src_sobelx]) \n\t" // r=src_sobelx[i] - "gsldrc1 %[tr], 0x00(%[src_sobelx]) \n\t" - "gsldlc1 %[tb], 0x07(%[src_sobely]) \n\t" // b=src_sobely[i] - "gsldrc1 %[tb], 0x00(%[src_sobely]) \n\t" - "paddusb %[tg] , %[tr], %[tb] \n\t" // g - - // g3 b3 g2 b2 g1 b1 g0 b0 - "punpcklbh %[gb], %[tb], %[tg] \n\t" - // c3 r3 r2 r2 c1 r1 c0 r0 - "punpcklbh %[cr], %[tr], %[c1] \n\t" - // c1 r1 g1 b1 c0 r0 g0 b0 - "punpcklhw %[result], %[gb], %[cr] \n\t" - "gssdrc1 %[result], 0x00(%[dst_argb]) \n\t" - "gssdlc1 %[result], 0x07(%[dst_argb]) \n\t" - // c3 r3 g3 b3 c2 r2 g2 b2 - "punpckhhw %[result], %[gb], %[cr] \n\t" - "gssdrc1 %[result], 0x08(%[dst_argb]) \n\t" - "gssdlc1 %[result], 0x0f(%[dst_argb]) \n\t" - - // g7 b7 g6 b6 g5 b5 g4 b4 - "punpckhbh %[gb], %[tb], %[tg] \n\t" - // c7 r7 c6 r6 c5 r5 c4 r4 - "punpckhbh %[cr], %[tr], %[c1] \n\t" - // c5 r5 g5 b5 c4 r4 g4 
b4 - "punpcklhw %[result], %[gb], %[cr] \n\t" - "gssdrc1 %[result], 0x10(%[dst_argb]) \n\t" - "gssdlc1 %[result], 0x17(%[dst_argb]) \n\t" - // c7 r7 g7 b7 c6 r6 g6 b6 - "punpckhhw %[result], %[gb], %[cr] \n\t" - "gssdrc1 %[result], 0x18(%[dst_argb]) \n\t" - "gssdlc1 %[result], 0x1f(%[dst_argb]) \n\t" - - "daddiu %[dst_argb], %[dst_argb], 32 \n\t" - "daddiu %[src_sobelx], %[src_sobelx], 8 \n\t" - "daddiu %[src_sobely], %[src_sobely], 8 \n\t" - "daddiu %[width], %[width], -8 \n\t" - "bgtz %[width], 1b \n\t" - "nop \n\t" - : [tr] "=&f"(temp[0]), [tb] "=&f"(temp[1]), [tg] "=&f"(temp[2]), - [gb] "=&f"(gb), [cr] "=&f"(cr), [result] "=&f"(result) - : [src_sobelx] "r"(src_sobelx), [src_sobely] "r"(src_sobely), - [dst_argb] "r"(dst_argb), [width] "r"(width), [c1] "f"(c1) - : "memory"); -} - -void J400ToARGBRow_MMI(const uint8_t* src_y, uint8_t* dst_argb, int width) { - // Copy a Y to RGB. - uint64_t src, dest; - const uint64_t mask0 = 0x00ffffff00ffffffULL; - const uint64_t mask1 = ~mask0; - - __asm__ volatile( - "1: \n\t" - "gslwlc1 %[src], 0x03(%[src_ptr]) \n\t" - "gslwrc1 %[src], 0x00(%[src_ptr]) \n\t" - "punpcklbh %[src], %[src], %[src] \n\t" - "punpcklhw %[dest], %[src], %[src] \n\t" - "and %[dest], %[dest], %[mask0] \n\t" - "or %[dest], %[dest], %[mask1] \n\t" - "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t" - "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t" - - "punpckhhw %[dest], %[src], %[src] \n\t" - "and %[dest], %[dest], %[mask0] \n\t" - "or %[dest], %[dest], %[mask1] \n\t" - "gssdrc1 %[dest], 0x08(%[dst_ptr]) \n\t" - "gssdlc1 %[dest], 0x0f(%[dst_ptr]) \n\t" - - "daddiu %[src_ptr], %[src_ptr], 0x04 \n\t" - "daddiu %[dst_ptr], %[dst_ptr], 0x10 \n\t" - "daddi %[width], %[width], -0x04 \n\t" - "bnez %[width], 1b \n\t" - : [src] "=&f"(src), [dest] "=&f"(dest) - : [src_ptr] "r"(src_y), [dst_ptr] "r"(dst_argb), [mask0] "f"(mask0), - [mask1] "f"(mask1), [width] "r"(width) - : "memory"); -} - -// TODO - respect YuvConstants -void I400ToARGBRow_MMI(const uint8_t* src_y, uint8_t* rgb_buf, - const struct YuvConstants*, int width) { - uint64_t src, src_lo, src_hi, dest, dest_lo, dest_hi; - const uint64_t mask0 = 0x0; - const uint64_t mask1 = 0x55; - const uint64_t mask2 = 0xAA; - const uint64_t mask3 = 0xFF; - const uint64_t mask4 = 0x4A354A354A354A35ULL; - const uint64_t mask5 = 0x0488048804880488ULL; - const uint64_t shift0 = 0x08; - const uint64_t shift1 = 0x06; - - __asm__ volatile( - "1: \n\t" - "gsldlc1 %[src], 0x07(%[src_ptr]) \n\t" - "gsldrc1 %[src], 0x00(%[src_ptr]) \n\t" - "punpcklbh %[src_lo], %[src], %[mask0] \n\t" - "punpckhbh %[src_hi], %[src], %[mask0] \n\t" - - "pshufh %[src], %[src_lo], %[mask0] \n\t" - "psllh %[dest_lo], %[src], %[shift0] \n\t" - "paddush %[dest_lo], %[dest_lo], %[src] \n\t" - "pmulhuh %[dest_lo], %[dest_lo], %[mask4] \n\t" - "psubh %[dest_lo], %[dest_lo], %[mask5] \n\t" - "psrah %[dest_lo], %[dest_lo], %[shift1] \n\t" - "pinsrh_3 %[dest_lo], %[dest_lo], %[mask3] \n\t" - "pshufh %[src], %[src_lo], %[mask1] \n\t" - "psllh %[dest_hi], %[src], %[shift0] \n\t" - "paddush %[dest_hi], %[dest_hi], %[src] \n\t" - "pmulhuh %[dest_hi], %[dest_hi], %[mask4] \n\t" - "psubh %[dest_hi], %[dest_hi], %[mask5] \n\t" - "psrah %[dest_hi], %[dest_hi], %[shift1] \n\t" - "pinsrh_3 %[dest_hi], %[dest_hi], %[mask3] \n\t" - "packushb %[dest], %[dest_lo], %[dest_hi] \n\t" - "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t" - "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t" - - "pshufh %[src], %[src_lo], %[mask2] \n\t" - "psllh %[dest_lo], %[src], %[shift0] \n\t" - "paddush %[dest_lo], %[dest_lo], %[src] \n\t" - "pmulhuh 
%[dest_lo], %[dest_lo], %[mask4] \n\t" - "psubh %[dest_lo], %[dest_lo], %[mask5] \n\t" - "psrah %[dest_lo], %[dest_lo], %[shift1] \n\t" - "pinsrh_3 %[dest_lo], %[dest_lo], %[mask3] \n\t" - "pshufh %[src], %[src_lo], %[mask3] \n\t" - "psllh %[dest_hi], %[src], %[shift0] \n\t" - "paddush %[dest_hi], %[dest_hi], %[src] \n\t" - "pmulhuh %[dest_hi], %[dest_hi], %[mask4] \n\t" - "psubh %[dest_hi], %[dest_hi], %[mask5] \n\t" - "psrah %[dest_hi], %[dest_hi], %[shift1] \n\t" - "pinsrh_3 %[dest_hi], %[dest_hi], %[mask3] \n\t" - "packushb %[dest], %[dest_lo], %[dest_hi] \n\t" - "gssdlc1 %[dest], 0x0f(%[dst_ptr]) \n\t" - "gssdrc1 %[dest], 0x08(%[dst_ptr]) \n\t" - - "pshufh %[src], %[src_hi], %[mask0] \n\t" - "psllh %[dest_lo], %[src], %[shift0] \n\t" - "paddush %[dest_lo], %[dest_lo], %[src] \n\t" - "pmulhuh %[dest_lo], %[dest_lo], %[mask4] \n\t" - "psubh %[dest_lo], %[dest_lo], %[mask5] \n\t" - "psrah %[dest_lo], %[dest_lo], %[shift1] \n\t" - "pinsrh_3 %[dest_lo], %[dest_lo], %[mask3] \n\t" - "pshufh %[src], %[src_hi], %[mask1] \n\t" - "psllh %[dest_hi], %[src], %[shift0] \n\t" - "paddush %[dest_hi], %[dest_hi], %[src] \n\t" - "pmulhuh %[dest_hi], %[dest_hi], %[mask4] \n\t" - "psubh %[dest_hi], %[dest_hi], %[mask5] \n\t" - "psrah %[dest_hi], %[dest_hi], %[shift1] \n\t" - "pinsrh_3 %[dest_hi], %[dest_hi], %[mask3] \n\t" - "packushb %[dest], %[dest_lo], %[dest_hi] \n\t" - "gssdlc1 %[dest], 0x17(%[dst_ptr]) \n\t" - "gssdrc1 %[dest], 0x10(%[dst_ptr]) \n\t" - - "pshufh %[src], %[src_hi], %[mask2] \n\t" - "psllh %[dest_lo], %[src], %[shift0] \n\t" - "paddush %[dest_lo], %[dest_lo], %[src] \n\t" - "pmulhuh %[dest_lo], %[dest_lo], %[mask4] \n\t" - "psubh %[dest_lo], %[dest_lo], %[mask5] \n\t" - "psrah %[dest_lo], %[dest_lo], %[shift1] \n\t" - "pinsrh_3 %[dest_lo], %[dest_lo], %[mask3] \n\t" - "pshufh %[src], %[src_hi], %[mask3] \n\t" - "psllh %[dest_hi], %[src], %[shift0] \n\t" - "paddush %[dest_hi], %[dest_hi], %[src] \n\t" - "pmulhuh %[dest_hi], %[dest_hi], %[mask4] \n\t" - "psubh %[dest_hi], %[dest_hi], %[mask5] \n\t" - "psrah %[dest_hi], %[dest_hi], %[shift1] \n\t" - "pinsrh_3 %[dest_hi], %[dest_hi], %[mask3] \n\t" - "packushb %[dest], %[dest_lo], %[dest_hi] \n\t" - "gssdlc1 %[dest], 0x1f(%[dst_ptr]) \n\t" - "gssdrc1 %[dest], 0x18(%[dst_ptr]) \n\t" - - "daddi %[src_ptr], %[src_ptr], 0x08 \n\t" - "daddiu %[dst_ptr], %[dst_ptr], 0x20 \n\t" - "daddi %[width], %[width], -0x08 \n\t" - "bnez %[width], 1b \n\t" - : [src] "=&f"(src), [dest] "=&f"(dest), [src_hi] "=&f"(src_hi), - [src_lo] "=&f"(src_lo), [dest_hi] "=&f"(dest_hi), - [dest_lo] "=&f"(dest_lo) - : [src_ptr] "r"(src_y), [dst_ptr] "r"(rgb_buf), [mask0] "f"(mask0), - [mask1] "f"(mask1), [mask2] "f"(mask2), [mask3] "f"(mask3), - [mask4] "f"(mask4), [mask5] "f"(mask5), [shift0] "f"(shift0), - [shift1] "f"(shift1), [width] "r"(width) - : "memory"); -} - -void MirrorRow_MMI(const uint8_t* src, uint8_t* dst, int width) { - uint64_t source, src0, src1, dest; - const uint64_t mask0 = 0x0; - const uint64_t mask1 = 0x1b; - - src += width - 1; - __asm__ volatile( - "1: \n\t" - "gsldlc1 %[source], 0(%[src_ptr]) \n\t" - "gsldrc1 %[source], -7(%[src_ptr]) \n\t" - "punpcklbh %[src0], %[source], %[mask0] \n\t" - "pshufh %[src0], %[src0], %[mask1] \n\t" - "punpckhbh %[src1], %[source], %[mask0] \n\t" - "pshufh %[src1], %[src1], %[mask1] \n\t" - "packushb %[dest], %[src1], %[src0] \n\t" - - "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t" - "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t" - - "daddi %[src_ptr], %[src_ptr], -0x08 \n\t" - "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t" - 
"daddi %[width], %[width], -0x08 \n\t" - "bnez %[width], 1b \n\t" - : [source] "=&f"(source), [dest] "=&f"(dest), [src0] "=&f"(src0), - [src1] "=&f"(src1) - : [src_ptr] "r"(src), [dst_ptr] "r"(dst), [mask0] "f"(mask0), - [mask1] "f"(mask1), [width] "r"(width) - : "memory"); -} - -void MirrorSplitUVRow_MMI(const uint8_t* src_uv, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - uint64_t src0, src1, dest0, dest1; - const uint64_t mask0 = 0x00ff00ff00ff00ffULL; - const uint64_t mask1 = 0x1b; - const uint64_t shift = 0x08; - - src_uv += (width - 1) << 1; - - __asm__ volatile( - "1: \n\t" - "gsldlc1 %[src0], 1(%[src_ptr]) \n\t" - "gsldrc1 %[src0], -6(%[src_ptr]) \n\t" - "gsldlc1 %[src1], -7(%[src_ptr]) \n\t" - "gsldrc1 %[src1], -14(%[src_ptr]) \n\t" - - "and %[dest0], %[src0], %[mask0] \n\t" - "pshufh %[dest0], %[dest0], %[mask1] \n\t" - "and %[dest1], %[src1], %[mask0] \n\t" - "pshufh %[dest1], %[dest1], %[mask1] \n\t" - "packushb %[dest0], %[dest0], %[dest1] \n\t" - "gssdlc1 %[dest0], 0x07(%[dstu_ptr]) \n\t" - "gssdrc1 %[dest0], 0x00(%[dstu_ptr]) \n\t" - - "psrlh %[dest0], %[src0], %[shift] \n\t" - "pshufh %[dest0], %[dest0], %[mask1] \n\t" - "psrlh %[dest1], %[src1], %[shift] \n\t" - "pshufh %[dest1], %[dest1], %[mask1] \n\t" - "packushb %[dest0], %[dest0], %[dest1] \n\t" - "gssdlc1 %[dest0], 0x07(%[dstv_ptr]) \n\t" - "gssdrc1 %[dest0], 0x00(%[dstv_ptr]) \n\t" - - "daddi %[src_ptr], %[src_ptr], -0x10 \n\t" - "daddiu %[dstu_ptr], %[dstu_ptr], 0x08 \n\t" - "daddiu %[dstv_ptr], %[dstv_ptr], 0x08 \n\t" - "daddi %[width], %[width], -0x08 \n\t" - "bnez %[width], 1b \n\t" - : [dest0] "=&f"(dest0), [dest1] "=&f"(dest1), [src0] "=&f"(src0), - [src1] "=&f"(src1) - : [src_ptr] "r"(src_uv), [dstu_ptr] "r"(dst_u), [dstv_ptr] "r"(dst_v), - [width] "r"(width), [mask0] "f"(mask0), [mask1] "f"(mask1), - [shift] "f"(shift) - : "memory"); -} - -void ARGBMirrorRow_MMI(const uint8_t* src, uint8_t* dst, int width) { - src += (width - 1) * 4; - uint64_t temp = 0x0; - uint64_t shuff = 0x4e; // 01 00 11 10 - __asm__ volatile( - "1: \n\t" - "gsldlc1 %[temp], 3(%[src]) \n\t" - "gsldrc1 %[temp], -4(%[src]) \n\t" - "pshufh %[temp], %[temp], %[shuff] \n\t" - "gssdrc1 %[temp], 0x0(%[dst]) \n\t" - "gssdlc1 %[temp], 0x7(%[dst]) \n\t" - - "daddiu %[src], %[src], -0x08 \n\t" - "daddiu %[dst], %[dst], 0x08 \n\t" - "daddiu %[width], %[width], -0x02 \n\t" - "bnez %[width], 1b \n\t" - : [temp] "=&f"(temp) - : [src] "r"(src), [dst] "r"(dst), [width] "r"(width), [shuff] "f"(shuff) - : "memory"); -} - -void SplitUVRow_MMI(const uint8_t* src_uv, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - uint64_t c0 = 0x00ff00ff00ff00ff; - uint64_t temp[4]; - uint64_t shift = 0x08; - __asm__ volatile( - "1: \n\t" - "gsldrc1 %[t0], 0x00(%[src_uv]) \n\t" - "gsldlc1 %[t0], 0x07(%[src_uv]) \n\t" - "gsldrc1 %[t1], 0x08(%[src_uv]) \n\t" - "gsldlc1 %[t1], 0x0f(%[src_uv]) \n\t" - - "and %[t2], %[t0], %[c0] \n\t" - "and %[t3], %[t1], %[c0] \n\t" - "packushb %[t2], %[t2], %[t3] \n\t" - "gssdrc1 %[t2], 0x0(%[dst_u]) \n\t" - "gssdlc1 %[t2], 0x7(%[dst_u]) \n\t" - - "psrlh %[t2], %[t0], %[shift] \n\t" - "psrlh %[t3], %[t1], %[shift] \n\t" - "packushb %[t2], %[t2], %[t3] \n\t" - "gssdrc1 %[t2], 0x0(%[dst_v]) \n\t" - "gssdlc1 %[t2], 0x7(%[dst_v]) \n\t" - - "daddiu %[src_uv], %[src_uv], 16 \n\t" - "daddiu %[dst_u], %[dst_u], 8 \n\t" - "daddiu %[dst_v], %[dst_v], 8 \n\t" - "daddiu %[width], %[width], -8 \n\t" - "bgtz %[width], 1b \n\t" - "nop \n\t" - : [t0] "=&f"(temp[0]), [t1] "=&f"(temp[1]), [t2] "=&f"(temp[2]), - [t3] "=&f"(temp[3]) - : [src_uv] 
"r"(src_uv), [dst_u] "r"(dst_u), [dst_v] "r"(dst_v), - [width] "r"(width), [c0] "f"(c0), [shift] "f"(shift) - : "memory"); -} - -void MergeUVRow_MMI(const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* dst_uv, - int width) { - uint64_t temp[3]; - __asm__ volatile( - "1: \n\t" - "gsldrc1 %[t0], 0x0(%[src_u]) \n\t" - "gsldlc1 %[t0], 0x7(%[src_u]) \n\t" - "gsldrc1 %[t1], 0x0(%[src_v]) \n\t" - "gsldlc1 %[t1], 0x7(%[src_v]) \n\t" - "punpcklbh %[t2], %[t0], %[t1] \n\t" - "gssdrc1 %[t2], 0x0(%[dst_uv]) \n\t" - "gssdlc1 %[t2], 0x7(%[dst_uv]) \n\t" - "punpckhbh %[t2], %[t0], %[t1] \n\t" - "gssdrc1 %[t2], 0x8(%[dst_uv]) \n\t" - "gssdlc1 %[t2], 0xf(%[dst_uv]) \n\t" - - "daddiu %[src_u], %[src_u], 8 \n\t" - "daddiu %[src_v], %[src_v], 8 \n\t" - "daddiu %[dst_uv], %[dst_uv], 16 \n\t" - "daddiu %[width], %[width], -8 \n\t" - "bgtz %[width], 1b \n\t" - "nop \n\t" - : [t0] "=&f"(temp[0]), [t1] "=&f"(temp[1]), [t2] "=&f"(temp[2]) - : [dst_uv] "r"(dst_uv), [src_u] "r"(src_u), [src_v] "r"(src_v), - [width] "r"(width) - : "memory"); -} - -void SplitRGBRow_MMI(const uint8_t* src_rgb, - uint8_t* dst_r, - uint8_t* dst_g, - uint8_t* dst_b, - int width) { - uint64_t src[4]; - uint64_t dest_hi, dest_lo, dest; - - __asm__ volatile( - "1: \n\t" - "gslwlc1 %[src0], 0x03(%[src_ptr]) \n\t" - "gslwrc1 %[src0], 0x00(%[src_ptr]) \n\t" - "gslwlc1 %[src1], 0x06(%[src_ptr]) \n\t" - "gslwrc1 %[src1], 0x03(%[src_ptr]) \n\t" - "punpcklbh %[dest_lo], %[src0], %[src1] \n\t" - "gslwlc1 %[src2], 0x09(%[src_ptr]) \n\t" - "gslwrc1 %[src2], 0x06(%[src_ptr]) \n\t" - "gslwlc1 %[src3], 0x0c(%[src_ptr]) \n\t" - "gslwrc1 %[src3], 0x09(%[src_ptr]) \n\t" - "punpcklbh %[dest_hi], %[src2], %[src3] \n\t" - - "punpcklhw %[dest], %[dest_lo], %[dest_hi] \n\t" - "gsswlc1 %[dest], 0x03(%[dstr_ptr]) \n\t" - "gsswrc1 %[dest], 0x00(%[dstr_ptr]) \n\t" - "punpckhwd %[dest], %[dest], %[dest] \n\t" - "gsswlc1 %[dest], 0x03(%[dstg_ptr]) \n\t" - "gsswrc1 %[dest], 0x00(%[dstg_ptr]) \n\t" - "punpckhhw %[dest], %[dest_lo], %[dest_hi] \n\t" - "gsswlc1 %[dest], 0x03(%[dstb_ptr]) \n\t" - "gsswrc1 %[dest], 0x00(%[dstb_ptr]) \n\t" - - "daddiu %[src_ptr], %[src_ptr], 0x0c \n\t" - "daddiu %[dstr_ptr], %[dstr_ptr], 0x04 \n\t" - "daddiu %[dstg_ptr], %[dstg_ptr], 0x04 \n\t" - "daddiu %[dstb_ptr], %[dstb_ptr], 0x04 \n\t" - "daddi %[width], %[width], -0x04 \n\t" - "bnez %[width], 1b \n\t" - : [src0] "=&f"(src[0]), [src1] "=&f"(src[1]), [src2] "=&f"(src[2]), - [src3] "=&f"(src[3]), [dest_hi] "=&f"(dest_hi), - [dest_lo] "=&f"(dest_lo), [dest] "=&f"(dest) - : [src_ptr] "r"(src_rgb), [dstr_ptr] "r"(dst_r), [dstg_ptr] "r"(dst_g), - [dstb_ptr] "r"(dst_b), [width] "r"(width) - : "memory"); -} - -void MergeRGBRow_MMI(const uint8_t* src_r, - const uint8_t* src_g, - const uint8_t* src_b, - uint8_t* dst_rgb, - int width) { - uint64_t srcr, srcg, srcb, dest; - uint64_t srcrg_hi, srcrg_lo, srcbz_hi, srcbz_lo; - const uint64_t temp = 0x0; - - __asm__ volatile( - "1: \n\t" - "gsldlc1 %[srcr], 0x07(%[srcr_ptr]) \n\t" - "gsldrc1 %[srcr], 0x00(%[srcr_ptr]) \n\t" - "gsldlc1 %[srcg], 0x07(%[srcg_ptr]) \n\t" - "gsldrc1 %[srcg], 0x00(%[srcg_ptr]) \n\t" - "punpcklbh %[srcrg_lo], %[srcr], %[srcg] \n\t" - "punpckhbh %[srcrg_hi], %[srcr], %[srcg] \n\t" - - "gsldlc1 %[srcb], 0x07(%[srcb_ptr]) \n\t" - "gsldrc1 %[srcb], 0x00(%[srcb_ptr]) \n\t" - "punpcklbh %[srcbz_lo], %[srcb], %[temp] \n\t" - "punpckhbh %[srcbz_hi], %[srcb], %[temp] \n\t" - - "punpcklhw %[dest], %[srcrg_lo], %[srcbz_lo] \n\t" - "gsswlc1 %[dest], 0x03(%[dst_ptr]) \n\t" - "gsswrc1 %[dest], 0x00(%[dst_ptr]) \n\t" - "punpckhwd 
%[dest], %[dest], %[dest] \n\t" - "gsswlc1 %[dest], 0x06(%[dst_ptr]) \n\t" - "gsswrc1 %[dest], 0x03(%[dst_ptr]) \n\t" - "punpckhhw %[dest], %[srcrg_lo], %[srcbz_lo] \n\t" - "gsswlc1 %[dest], 0x09(%[dst_ptr]) \n\t" - "gsswrc1 %[dest], 0x06(%[dst_ptr]) \n\t" - "punpckhwd %[dest], %[dest], %[dest] \n\t" - "gsswlc1 %[dest], 0x0c(%[dst_ptr]) \n\t" - "gsswrc1 %[dest], 0x09(%[dst_ptr]) \n\t" - "punpcklhw %[dest], %[srcrg_hi], %[srcbz_hi] \n\t" - "gsswlc1 %[dest], 0x0f(%[dst_ptr]) \n\t" - "gsswrc1 %[dest], 0x0c(%[dst_ptr]) \n\t" - "punpckhwd %[dest], %[dest], %[dest] \n\t" - "gsswlc1 %[dest], 0x12(%[dst_ptr]) \n\t" - "gsswrc1 %[dest], 0x0f(%[dst_ptr]) \n\t" - "punpckhhw %[dest], %[srcrg_hi], %[srcbz_hi] \n\t" - "gsswlc1 %[dest], 0x15(%[dst_ptr]) \n\t" - "gsswrc1 %[dest], 0x12(%[dst_ptr]) \n\t" - "punpckhwd %[dest], %[dest], %[dest] \n\t" - "gsswlc1 %[dest], 0x18(%[dst_ptr]) \n\t" - "gsswrc1 %[dest], 0x15(%[dst_ptr]) \n\t" - - "daddiu %[srcr_ptr], %[srcr_ptr], 0x08 \n\t" - "daddiu %[srcg_ptr], %[srcg_ptr], 0x08 \n\t" - "daddiu %[srcb_ptr], %[srcb_ptr], 0x08 \n\t" - "daddiu %[dst_ptr], %[dst_ptr], 0x18 \n\t" - "daddi %[width], %[width], -0x08 \n\t" - "bnez %[width], 1b \n\t" - : [srcr] "=&f"(srcr), [srcg] "=&f"(srcg), [srcb] "=&f"(srcb), - [dest] "=&f"(dest), [srcrg_hi] "=&f"(srcrg_hi), - [srcrg_lo] "=&f"(srcrg_lo), [srcbz_hi] "=&f"(srcbz_hi), - [srcbz_lo] "=&f"(srcbz_lo) - : [srcr_ptr] "r"(src_r), [srcg_ptr] "r"(src_g), [srcb_ptr] "r"(src_b), - [dst_ptr] "r"(dst_rgb), [width] "r"(width), [temp] "f"(temp) - : "memory"); -} - -// Filter 2 rows of YUY2 UV's (422) into U and V (420). -void YUY2ToUVRow_MMI(const uint8_t* src_yuy2, - int src_stride_yuy2, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - uint64_t c0 = 0xff00ff00ff00ff00; - uint64_t c1 = 0x00ff00ff00ff00ff; - uint64_t temp[3]; - uint64_t data[4]; - uint64_t shift = 0x08; - uint64_t src_stride = 0x0; - __asm__ volatile( - "1: \n\t" - "gsldrc1 %[t0], 0x00(%[src_yuy2]) \n\t" - "gsldlc1 %[t0], 0x07(%[src_yuy2]) \n\t" - "daddu %[src_stride], %[src_yuy2], %[src_stride_yuy2] \n\t" - "gsldrc1 %[t1], 0x00(%[src_stride]) \n\t" - "gsldlc1 %[t1], 0x07(%[src_stride]) \n\t" - "pavgb %[t0], %[t0], %[t1] \n\t" - - "gsldrc1 %[t2], 0x08(%[src_yuy2]) \n\t" - "gsldlc1 %[t2], 0x0f(%[src_yuy2]) \n\t" - "gsldrc1 %[t1], 0x08(%[src_stride]) \n\t" - "gsldlc1 %[t1], 0x0f(%[src_stride]) \n\t" - "pavgb %[t1], %[t2], %[t1] \n\t" - - "and %[t0], %[t0], %[c0] \n\t" - "and %[t1], %[t1], %[c0] \n\t" - "psrlh %[t0], %[t0], %[shift] \n\t" - "psrlh %[t1], %[t1], %[shift] \n\t" - "packushb %[t0], %[t0], %[t1] \n\t" - "mov.s %[t1], %[t0] \n\t" - "and %[d0], %[t0], %[c1] \n\t" - "psrlh %[d1], %[t1], %[shift] \n\t" - - "gsldrc1 %[t0], 0x10(%[src_yuy2]) \n\t" - "gsldlc1 %[t0], 0x17(%[src_yuy2]) \n\t" - "gsldrc1 %[t1], 0x10(%[src_stride]) \n\t" - "gsldlc1 %[t1], 0x17(%[src_stride]) \n\t" - "pavgb %[t0], %[t0], %[t1] \n\t" - - "gsldrc1 %[t2], 0x18(%[src_yuy2]) \n\t" - "gsldlc1 %[t2], 0x1f(%[src_yuy2]) \n\t" - "gsldrc1 %[t1], 0x18(%[src_stride]) \n\t" - "gsldlc1 %[t1], 0x1f(%[src_stride]) \n\t" - "pavgb %[t1], %[t2], %[t1] \n\t" - - "and %[t0], %[t0], %[c0] \n\t" - "and %[t1], %[t1], %[c0] \n\t" - "psrlh %[t0], %[t0], %[shift] \n\t" - "psrlh %[t1], %[t1], %[shift] \n\t" - "packushb %[t0], %[t0], %[t1] \n\t" - "mov.s %[t1], %[t0] \n\t" - "and %[d2], %[t0], %[c1] \n\t" - "psrlh %[d3], %[t1], %[shift] \n\t" - - "packushb %[d0], %[d0], %[d2] \n\t" - "packushb %[d1], %[d1], %[d3] \n\t" - "gssdrc1 %[d0], 0x0(%[dst_u]) \n\t" - "gssdlc1 %[d0], 0x7(%[dst_u]) \n\t" - "gssdrc1 %[d1], 
0x0(%[dst_v]) \n\t" - "gssdlc1 %[d1], 0x7(%[dst_v]) \n\t" - "daddiu %[src_yuy2], %[src_yuy2], 32 \n\t" - "daddiu %[dst_u], %[dst_u], 8 \n\t" - "daddiu %[dst_v], %[dst_v], 8 \n\t" - "daddiu %[width], %[width], -16 \n\t" - "bgtz %[width], 1b \n\t" - "nop \n\t" - : [t0] "=&f"(temp[0]), [t1] "=&f"(temp[1]), [t2] "=&f"(temp[2]), - [d0] "=&f"(data[0]), [d1] "=&f"(data[1]), [d2] "=&f"(data[2]), - [d3] "=&f"(data[3]), [src_stride] "=&r"(src_stride) - : [src_yuy2] "r"(src_yuy2), [src_stride_yuy2] "r"(src_stride_yuy2), - [dst_u] "r"(dst_u), [dst_v] "r"(dst_v), [width] "r"(width), - [c0] "f"(c0), [c1] "f"(c1), [shift] "f"(shift) - : "memory"); -} - -// Copy row of YUY2 UV's (422) into U and V (422). -void YUY2ToUV422Row_MMI(const uint8_t* src_yuy2, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - uint64_t c0 = 0xff00ff00ff00ff00; - uint64_t c1 = 0x00ff00ff00ff00ff; - uint64_t temp[2]; - uint64_t data[4]; - uint64_t shift = 0x08; - __asm__ volatile( - "1: \n\t" - "gsldrc1 %[t0], 0x00(%[src_yuy2]) \n\t" - "gsldlc1 %[t0], 0x07(%[src_yuy2]) \n\t" - "gsldrc1 %[t1], 0x08(%[src_yuy2]) \n\t" - "gsldlc1 %[t1], 0x0f(%[src_yuy2]) \n\t" - "and %[t0], %[t0], %[c0] \n\t" - "and %[t1], %[t1], %[c0] \n\t" - "psrlh %[t0], %[t0], %[shift] \n\t" - "psrlh %[t1], %[t1], %[shift] \n\t" - "packushb %[t0], %[t0], %[t1] \n\t" - "mov.s %[t1], %[t0] \n\t" - "and %[d0], %[t0], %[c1] \n\t" - "psrlh %[d1], %[t1], %[shift] \n\t" - - "gsldrc1 %[t0], 0x10(%[src_yuy2]) \n\t" - "gsldlc1 %[t0], 0x17(%[src_yuy2]) \n\t" - "gsldrc1 %[t1], 0x18(%[src_yuy2]) \n\t" - "gsldlc1 %[t1], 0x1f(%[src_yuy2]) \n\t" - "and %[t0], %[t0], %[c0] \n\t" - "and %[t1], %[t1], %[c0] \n\t" - "psrlh %[t0], %[t0], %[shift] \n\t" - "psrlh %[t1], %[t1], %[shift] \n\t" - "packushb %[t0], %[t0], %[t1] \n\t" - "mov.s %[t1], %[t0] \n\t" - "and %[d2], %[t0], %[c1] \n\t" - "psrlh %[d3], %[t1], %[shift] \n\t" - - "packushb %[d0], %[d0], %[d2] \n\t" - "packushb %[d1], %[d1], %[d3] \n\t" - "gssdrc1 %[d0], 0x0(%[dst_u]) \n\t" - "gssdlc1 %[d0], 0x7(%[dst_u]) \n\t" - "gssdrc1 %[d1], 0x0(%[dst_v]) \n\t" - "gssdlc1 %[d1], 0x7(%[dst_v]) \n\t" - "daddiu %[src_yuy2], %[src_yuy2], 32 \n\t" - "daddiu %[dst_u], %[dst_u], 8 \n\t" - "daddiu %[dst_v], %[dst_v], 8 \n\t" - "daddiu %[width], %[width], -16 \n\t" - "bgtz %[width], 1b \n\t" - "nop \n\t" - : [t0] "=&f"(temp[0]), [t1] "=&f"(temp[1]), [d0] "=&f"(data[0]), - [d1] "=&f"(data[1]), [d2] "=&f"(data[2]), [d3] "=&f"(data[3]) - : [src_yuy2] "r"(src_yuy2), [dst_u] "r"(dst_u), [dst_v] "r"(dst_v), - [width] "r"(width), [c0] "f"(c0), [c1] "f"(c1), [shift] "f"(shift) - : "memory"); -} - -// Copy row of YUY2 Y's (422) into Y (420/422). -void YUY2ToYRow_MMI(const uint8_t* src_yuy2, uint8_t* dst_y, int width) { - uint64_t c0 = 0x00ff00ff00ff00ff; - uint64_t temp[2]; - __asm__ volatile( - "1: \n\t" - "gsldrc1 %[t0], 0x00(%[src_yuy2]) \n\t" - "gsldlc1 %[t0], 0x07(%[src_yuy2]) \n\t" - "gsldrc1 %[t1], 0x08(%[src_yuy2]) \n\t" - "gsldlc1 %[t1], 0x0f(%[src_yuy2]) \n\t" - "and %[t0], %[t0], %[c0] \n\t" - "and %[t1], %[t1], %[c0] \n\t" - "packushb %[t0], %[t0], %[t1] \n\t" - "gssdrc1 %[t0], 0x0(%[dst_y]) \n\t" - "gssdlc1 %[t0], 0x7(%[dst_y]) \n\t" - "daddiu %[src_yuy2], %[src_yuy2], 16 \n\t" - "daddiu %[dst_y], %[dst_y], 8 \n\t" - "daddiu %[width], %[width], -8 \n\t" - "bgtz %[width], 1b \n\t" - "nop \n\t" - : [t0] "=&f"(temp[0]), [t1] "=&f"(temp[1]) - : [src_yuy2] "r"(src_yuy2), [dst_y] "r"(dst_y), [width] "r"(width), - [c0] "f"(c0) - : "memory"); -} - -// Filter 2 rows of UYVY UV's (422) into U and V (420). 
-void UYVYToUVRow_MMI(const uint8_t* src_uyvy, - int src_stride_uyvy, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - // Output a row of UV values. - uint64_t c0 = 0x00ff00ff00ff00ff; - uint64_t temp[3]; - uint64_t data[4]; - uint64_t shift = 0x08; - uint64_t src_stride = 0x0; - __asm__ volatile( - "1: \n\t" - "gsldrc1 %[t0], 0x00(%[src_uyvy]) \n\t" - "gsldlc1 %[t0], 0x07(%[src_uyvy]) \n\t" - "daddu %[src_stride], %[src_uyvy], %[src_stride_uyvy] \n\t" - "gsldrc1 %[t1], 0x00(%[src_stride]) \n\t" - "gsldlc1 %[t1], 0x07(%[src_stride]) \n\t" - "pavgb %[t0], %[t0], %[t1] \n\t" - - "gsldrc1 %[t2], 0x08(%[src_uyvy]) \n\t" - "gsldlc1 %[t2], 0x0f(%[src_uyvy]) \n\t" - "gsldrc1 %[t1], 0x08(%[src_stride]) \n\t" - "gsldlc1 %[t1], 0x0f(%[src_stride]) \n\t" - "pavgb %[t1], %[t2], %[t1] \n\t" - - "and %[t0], %[t0], %[c0] \n\t" - "and %[t1], %[t1], %[c0] \n\t" - "packushb %[t0], %[t0], %[t1] \n\t" - "mov.s %[t1], %[t0] \n\t" - "and %[d0], %[t0], %[c0] \n\t" - "psrlh %[d1], %[t1], %[shift] \n\t" - - "gsldrc1 %[t0], 0x10(%[src_uyvy]) \n\t" - "gsldlc1 %[t0], 0x17(%[src_uyvy]) \n\t" - "gsldrc1 %[t1], 0x10(%[src_stride]) \n\t" - "gsldlc1 %[t1], 0x17(%[src_stride]) \n\t" - "pavgb %[t0], %[t0], %[t1] \n\t" - - "gsldrc1 %[t2], 0x18(%[src_uyvy]) \n\t" - "gsldlc1 %[t2], 0x1f(%[src_uyvy]) \n\t" - "gsldrc1 %[t1], 0x18(%[src_stride]) \n\t" - "gsldlc1 %[t1], 0x1f(%[src_stride]) \n\t" - "pavgb %[t1], %[t2], %[t1] \n\t" - - "and %[t0], %[t0], %[c0] \n\t" - "and %[t1], %[t1], %[c0] \n\t" - "packushb %[t0], %[t0], %[t1] \n\t" - "mov.s %[t1], %[t0] \n\t" - "and %[d2], %[t0], %[c0] \n\t" - "psrlh %[d3], %[t1], %[shift] \n\t" - - "packushb %[d0], %[d0], %[d2] \n\t" - "packushb %[d1], %[d1], %[d3] \n\t" - "gssdrc1 %[d0], 0x0(%[dst_u]) \n\t" - "gssdlc1 %[d0], 0x7(%[dst_u]) \n\t" - "gssdrc1 %[d1], 0x0(%[dst_v]) \n\t" - "gssdlc1 %[d1], 0x7(%[dst_v]) \n\t" - "daddiu %[src_uyvy], %[src_uyvy], 32 \n\t" - "daddiu %[dst_u], %[dst_u], 8 \n\t" - "daddiu %[dst_v], %[dst_v], 8 \n\t" - "daddiu %[width], %[width], -16 \n\t" - "bgtz %[width], 1b \n\t" - "nop \n\t" - : [t0] "=&f"(temp[0]), [t1] "=&f"(temp[1]), [t2] "=&f"(temp[2]), - [d0] "=&f"(data[0]), [d1] "=&f"(data[1]), [d2] "=&f"(data[2]), - [d3] "=&f"(data[3]), [src_stride] "=&r"(src_stride) - : [src_uyvy] "r"(src_uyvy), [src_stride_uyvy] "r"(src_stride_uyvy), - [dst_u] "r"(dst_u), [dst_v] "r"(dst_v), [width] "r"(width), - [c0] "f"(c0), [shift] "f"(shift) - : "memory"); -} - -// Copy row of UYVY UV's (422) into U and V (422). -void UYVYToUV422Row_MMI(const uint8_t* src_uyvy, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - // Output a row of UV values. 
- uint64_t c0 = 0x00ff00ff00ff00ff; - uint64_t temp[2]; - uint64_t data[4]; - uint64_t shift = 0x08; - __asm__ volatile( - "1: \n\t" - "gsldrc1 %[t0], 0x00(%[src_uyvy]) \n\t" - "gsldlc1 %[t0], 0x07(%[src_uyvy]) \n\t" - "gsldrc1 %[t1], 0x08(%[src_uyvy]) \n\t" - "gsldlc1 %[t1], 0x0f(%[src_uyvy]) \n\t" - "and %[t0], %[t0], %[c0] \n\t" - "and %[t1], %[t1], %[c0] \n\t" - "packushb %[t0], %[t0], %[t1] \n\t" - "mov.s %[t1], %[t0] \n\t" - "and %[d0], %[t0], %[c0] \n\t" - "psrlh %[d1], %[t1], %[shift] \n\t" - - "gsldrc1 %[t0], 0x10(%[src_uyvy]) \n\t" - "gsldlc1 %[t0], 0x17(%[src_uyvy]) \n\t" - "gsldrc1 %[t1], 0x18(%[src_uyvy]) \n\t" - "gsldlc1 %[t1], 0x1f(%[src_uyvy]) \n\t" - "and %[t0], %[t0], %[c0] \n\t" - "and %[t1], %[t1], %[c0] \n\t" - "packushb %[t0], %[t0], %[t1] \n\t" - "mov.s %[t1], %[t0] \n\t" - "and %[d2], %[t0], %[c0] \n\t" - "psrlh %[d3], %[t1], %[shift] \n\t" - - "packushb %[d0], %[d0], %[d2] \n\t" - "packushb %[d1], %[d1], %[d3] \n\t" - "gssdrc1 %[d0], 0x0(%[dst_u]) \n\t" - "gssdlc1 %[d0], 0x7(%[dst_u]) \n\t" - "gssdrc1 %[d1], 0x0(%[dst_v]) \n\t" - "gssdlc1 %[d1], 0x7(%[dst_v]) \n\t" - "daddiu %[src_uyvy], %[src_uyvy], 32 \n\t" - "daddiu %[dst_u], %[dst_u], 8 \n\t" - "daddiu %[dst_v], %[dst_v], 8 \n\t" - "daddiu %[width], %[width], -16 \n\t" - "bgtz %[width], 1b \n\t" - "nop \n\t" - : [t0] "=&f"(temp[0]), [t1] "=&f"(temp[1]), [d0] "=&f"(data[0]), - [d1] "=&f"(data[1]), [d2] "=&f"(data[2]), [d3] "=&f"(data[3]) - : [src_uyvy] "r"(src_uyvy), [dst_u] "r"(dst_u), [dst_v] "r"(dst_v), - [width] "r"(width), [c0] "f"(c0), [shift] "f"(shift) - : "memory"); -} - -// Copy row of UYVY Y's (422) into Y (420/422). -void UYVYToYRow_MMI(const uint8_t* src_uyvy, uint8_t* dst_y, int width) { - // Output a row of Y values. - uint64_t c0 = 0x00ff00ff00ff00ff; - uint64_t shift = 0x08; - uint64_t temp[2]; - __asm__ volatile( - "1: \n\t" - "gsldrc1 %[t0], 0x00(%[src_uyvy]) \n\t" - "gsldlc1 %[t0], 0x07(%[src_uyvy]) \n\t" - "gsldrc1 %[t1], 0x08(%[src_uyvy]) \n\t" - "gsldlc1 %[t1], 0x0f(%[src_uyvy]) \n\t" - "dsrl %[t0], %[t0], %[shift] \n\t" - "dsrl %[t1], %[t1], %[shift] \n\t" - "and %[t0], %[t0], %[c0] \n\t" - "and %[t1], %[t1], %[c0] \n\t" - "and %[t1], %[t1], %[c0] \n\t" - "packushb %[t0], %[t0], %[t1] \n\t" - "gssdrc1 %[t0], 0x0(%[dst_y]) \n\t" - "gssdlc1 %[t0], 0x7(%[dst_y]) \n\t" - "daddiu %[src_uyvy], %[src_uyvy], 16 \n\t" - "daddiu %[dst_y], %[dst_y], 8 \n\t" - "daddiu %[width], %[width], -8 \n\t" - "bgtz %[width], 1b \n\t" - "nop \n\t" - : [t0] "=&f"(temp[0]), [t1] "=&f"(temp[1]) - : [src_uyvy] "r"(src_uyvy), [dst_y] "r"(dst_y), [width] "r"(width), - [c0] "f"(c0), [shift] "f"(shift) - : "memory"); -} - -// Blend src_argb over src_argb1 and store to dst_argb. -// dst_argb may be src_argb or src_argb1. -// This code mimics the SSSE3 version for better testability. 
-void ARGBBlendRow_MMI(const uint8_t* src_argb, - const uint8_t* src_argb1, - uint8_t* dst_argb, - int width) { - uint64_t src0, src1, dest, alpha, src0_hi, src0_lo, src1_hi, src1_lo, dest_hi, - dest_lo; - const uint64_t mask0 = 0x0; - const uint64_t mask1 = 0x00FFFFFF00FFFFFFULL; - const uint64_t mask2 = 0x00FF00FF00FF00FFULL; - const uint64_t mask3 = 0xFF; - const uint64_t mask4 = ~mask1; - const uint64_t shift = 0x08; - - __asm__ volatile( - "1: \n\t" - "gsldlc1 %[src0], 0x07(%[src0_ptr]) \n\t" - "gsldrc1 %[src0], 0x00(%[src0_ptr]) \n\t" - "punpcklbh %[src0_lo], %[src0], %[mask0] \n\t" - - "gsldlc1 %[src1], 0x07(%[src1_ptr]) \n\t" - "gsldrc1 %[src1], 0x00(%[src1_ptr]) \n\t" - "punpcklbh %[src1_lo], %[src1], %[mask0] \n\t" - - "psubush %[alpha], %[mask2], %[src0_lo] \n\t" - "pshufh %[alpha], %[alpha], %[mask3] \n\t" - "pmullh %[dest_lo], %[src1_lo], %[alpha] \n\t" - "psrlh %[dest_lo], %[dest_lo], %[shift] \n\t" - "paddush %[dest_lo], %[dest_lo], %[src0_lo] \n\t" - - "punpckhbh %[src0_hi], %[src0], %[mask0] \n\t" - "punpckhbh %[src1_hi], %[src1], %[mask0] \n\t" - - "psubush %[alpha], %[mask2], %[src0_hi] \n\t" - "pshufh %[alpha], %[alpha], %[mask3] \n\t" - "pmullh %[dest_hi], %[src1_hi], %[alpha] \n\t" - "psrlh %[dest_hi], %[dest_hi], %[shift] \n\t" - "paddush %[dest_hi], %[dest_hi], %[src0_hi] \n\t" - - "packushb %[dest], %[dest_lo], %[dest_hi] \n\t" - "and %[dest], %[dest], %[mask1] \n\t" - "or %[dest], %[dest], %[mask4] \n\t" - "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t" - "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t" - - "daddiu %[src0_ptr], %[src0_ptr], 0x08 \n\t" - "daddiu %[src1_ptr], %[src1_ptr], 0x08 \n\t" - "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t" - "daddi %[width], %[width], -0x02 \n\t" - "bnez %[width], 1b \n\t" - : [src0] "=&f"(src0), [src1] "=&f"(src1), [alpha] "=&f"(alpha), - [dest] "=&f"(dest), [src0_hi] "=&f"(src0_hi), [src0_lo] "=&f"(src0_lo), - [src1_hi] "=&f"(src1_hi), [src1_lo] "=&f"(src1_lo), - [dest_hi] "=&f"(dest_hi), [dest_lo] "=&f"(dest_lo) - : [src0_ptr] "r"(src_argb), [src1_ptr] "r"(src_argb1), - [dst_ptr] "r"(dst_argb), [mask0] "f"(mask0), [mask1] "f"(mask1), - [mask2] "f"(mask2), [mask3] "f"(mask3), [mask4] "f"(mask4), - [shift] "f"(shift), [width] "r"(width) - : "memory"); -} - -void BlendPlaneRow_MMI(const uint8_t* src0, - const uint8_t* src1, - const uint8_t* alpha, - uint8_t* dst, - int width) { - uint64_t source0, source1, dest, alph; - uint64_t src0_hi, src0_lo, src1_hi, src1_lo, alpha_hi, alpha_lo, dest_hi, - dest_lo; - uint64_t alpha_rev, alpha_rev_lo, alpha_rev_hi; - const uint64_t mask0 = 0x0; - const uint64_t mask1 = 0xFFFFFFFFFFFFFFFFULL; - const uint64_t mask2 = 0x00FF00FF00FF00FFULL; - const uint64_t shift = 0x08; - - __asm__ volatile( - "1: \n\t" - "gsldlc1 %[src0], 0x07(%[src0_ptr]) \n\t" - "gsldrc1 %[src0], 0x00(%[src0_ptr]) \n\t" - "punpcklbh %[src0_lo], %[src0], %[mask0] \n\t" - "punpckhbh %[src0_hi], %[src0], %[mask0] \n\t" - - "gsldlc1 %[src1], 0x07(%[src1_ptr]) \n\t" - "gsldrc1 %[src1], 0x00(%[src1_ptr]) \n\t" - "punpcklbh %[src1_lo], %[src1], %[mask0] \n\t" - "punpckhbh %[src1_hi], %[src1], %[mask0] \n\t" - - "gsldlc1 %[alpha], 0x07(%[alpha_ptr]) \n\t" - "gsldrc1 %[alpha], 0x00(%[alpha_ptr]) \n\t" - "psubusb %[alpha_r], %[mask1], %[alpha] \n\t" - "punpcklbh %[alpha_lo], %[alpha], %[mask0] \n\t" - "punpckhbh %[alpha_hi], %[alpha], %[mask0] \n\t" - "punpcklbh %[alpha_rlo], %[alpha_r], %[mask0] \n\t" - "punpckhbh %[alpha_rhi], %[alpha_r], %[mask0] \n\t" - - "pmullh %[dest_lo], %[src0_lo], %[alpha_lo] \n\t" - "pmullh %[dest], %[src1_lo], 
%[alpha_rlo] \n\t" - "paddush %[dest_lo], %[dest_lo], %[dest] \n\t" - "paddush %[dest_lo], %[dest_lo], %[mask2] \n\t" - "psrlh %[dest_lo], %[dest_lo], %[shift] \n\t" - - "pmullh %[dest_hi], %[src0_hi], %[alpha_hi] \n\t" - "pmullh %[dest], %[src1_hi], %[alpha_rhi] \n\t" - "paddush %[dest_hi], %[dest_hi], %[dest] \n\t" - "paddush %[dest_hi], %[dest_hi], %[mask2] \n\t" - "psrlh %[dest_hi], %[dest_hi], %[shift] \n\t" - - "packushb %[dest], %[dest_lo], %[dest_hi] \n\t" - "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t" - "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t" - - "daddiu %[src0_ptr], %[src0_ptr], 0x08 \n\t" - "daddiu %[src1_ptr], %[src1_ptr], 0x08 \n\t" - "daddiu %[alpha_ptr], %[alpha_ptr], 0x08 \n\t" - "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t" - "daddi %[width], %[width], -0x08 \n\t" - "bnez %[width], 1b \n\t" - : [src0] "=&f"(source0), [src1] "=&f"(source1), [alpha] "=&f"(alph), - [dest] "=&f"(dest), [src0_hi] "=&f"(src0_hi), [src0_lo] "=&f"(src0_lo), - [src1_hi] "=&f"(src1_hi), [src1_lo] "=&f"(src1_lo), - [alpha_hi] "=&f"(alpha_hi), [alpha_lo] "=&f"(alpha_lo), - [dest_hi] "=&f"(dest_hi), [dest_lo] "=&f"(dest_lo), - [alpha_rlo] "=&f"(alpha_rev_lo), [alpha_rhi] "=&f"(alpha_rev_hi), - [alpha_r] "=&f"(alpha_rev) - : [src0_ptr] "r"(src0), [src1_ptr] "r"(src1), [alpha_ptr] "r"(alpha), - [dst_ptr] "r"(dst), [mask0] "f"(mask0), [mask1] "f"(mask1), - [mask2] "f"(mask2), [shift] "f"(shift), [width] "r"(width) - : "memory"); -} - -// Multiply source RGB by alpha and store to destination. -// This code mimics the SSSE3 version for better testability. -void ARGBAttenuateRow_MMI(const uint8_t* src_argb, - uint8_t* dst_argb, - int width) { - uint64_t src, src_hi, src_lo, dest, dest_hi, dest_lo, alpha; - const uint64_t mask0 = 0xFF; - const uint64_t mask1 = 0xFF000000FF000000ULL; - const uint64_t mask2 = ~mask1; - const uint64_t shift = 0x08; - - __asm__ volatile( - "1: \n\t" - "gsldlc1 %[src], 0x07(%[src_ptr]) \n\t" - "gsldrc1 %[src], 0x00(%[src_ptr]) \n\t" - "punpcklbh %[src_lo], %[src], %[src] \n\t" - "punpckhbh %[src_hi], %[src], %[src] \n\t" - - "pshufh %[alpha], %[src_lo], %[mask0] \n\t" - "pmulhuh %[dest_lo], %[alpha], %[src_lo] \n\t" - "psrlh %[dest_lo], %[dest_lo], %[shift] \n\t" - "pshufh %[alpha], %[src_hi], %[mask0] \n\t" - "pmulhuh %[dest_hi], %[alpha], %[src_hi] \n\t" - "psrlh %[dest_hi], %[dest_hi], %[shift] \n\t" - - "packushb %[dest], %[dest_lo], %[dest_hi] \n\t" - "and %[dest], %[dest], %[mask2] \n\t" - "and %[src], %[src], %[mask1] \n\t" - "or %[dest], %[dest], %[src] \n\t" - "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t" - "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t" - - "daddiu %[src_ptr], %[src_ptr], 0x08 \n\t" - "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t" - "daddi %[width], %[width], -0x02 \n\t" - "bnez %[width], 1b \n\t" - : [src] "=&f"(src), [dest] "=&f"(dest), [src_hi] "=&f"(src_hi), - [src_lo] "=&f"(src_lo), [dest_hi] "=&f"(dest_hi), - [dest_lo] "=&f"(dest_lo), [alpha] "=&f"(alpha) - : [src_ptr] "r"(src_argb), [dst_ptr] "r"(dst_argb), [mask0] "f"(mask0), - [mask1] "f"(mask1), [mask2] "f"(mask2), [shift] "f"(shift), - [width] "r"(width) - : "memory"); -} - -void ComputeCumulativeSumRow_MMI(const uint8_t* row, - int32_t* cumsum, - const int32_t* previous_cumsum, - int width) { - int64_t row_sum[2] = {0, 0}; - uint64_t src, dest0, dest1, presrc0, presrc1, dest; - const uint64_t mask = 0x0; - - __asm__ volatile( - "xor %[row_sum0], %[row_sum0], %[row_sum0] \n\t" - "xor %[row_sum1], %[row_sum1], %[row_sum1] \n\t" - - "1: \n\t" - "gslwlc1 %[src], 0x03(%[row_ptr]) \n\t" - "gslwrc1 %[src], 
0x00(%[row_ptr]) \n\t" - - "punpcklbh %[src], %[src], %[mask] \n\t" - "punpcklhw %[dest0], %[src], %[mask] \n\t" - "punpckhhw %[dest1], %[src], %[mask] \n\t" - - "paddw %[row_sum0], %[row_sum0], %[dest0] \n\t" - "paddw %[row_sum1], %[row_sum1], %[dest1] \n\t" - - "gsldlc1 %[presrc0], 0x07(%[pre_ptr]) \n\t" - "gsldrc1 %[presrc0], 0x00(%[pre_ptr]) \n\t" - "gsldlc1 %[presrc1], 0x0f(%[pre_ptr]) \n\t" - "gsldrc1 %[presrc1], 0x08(%[pre_ptr]) \n\t" - - "paddw %[dest0], %[row_sum0], %[presrc0] \n\t" - "paddw %[dest1], %[row_sum1], %[presrc1] \n\t" - - "gssdlc1 %[dest0], 0x07(%[dst_ptr]) \n\t" - "gssdrc1 %[dest0], 0x00(%[dst_ptr]) \n\t" - "gssdlc1 %[dest1], 0x0f(%[dst_ptr]) \n\t" - "gssdrc1 %[dest1], 0x08(%[dst_ptr]) \n\t" - - "daddiu %[row_ptr], %[row_ptr], 0x04 \n\t" - "daddiu %[pre_ptr], %[pre_ptr], 0x10 \n\t" - "daddiu %[dst_ptr], %[dst_ptr], 0x10 \n\t" - "daddi %[width], %[width], -0x01 \n\t" - "bnez %[width], 1b \n\t" - : [src] "=&f"(src), [dest] "=&f"(dest), [dest0] "=&f"(dest0), - [dest1] "=&f"(dest1), [row_sum0] "+&f"(row_sum[0]), - [row_sum1] "+&f"(row_sum[1]), [presrc0] "=&f"(presrc0), - [presrc1] "=&f"(presrc1) - : [row_ptr] "r"(row), [pre_ptr] "r"(previous_cumsum), - [dst_ptr] "r"(cumsum), [width] "r"(width), [mask] "f"(mask) - : "memory"); -} - -// C version 2x2 -> 2x1. -void InterpolateRow_MMI(uint8_t* dst_ptr, - const uint8_t* src_ptr, - ptrdiff_t src_stride, - int width, - int source_y_fraction) { - if (source_y_fraction == 0) { - __asm__ volatile( - "1: \n\t" - "ld $t0, 0x0(%[src_ptr]) \n\t" - "sd $t0, 0x0(%[dst_ptr]) \n\t" - "daddiu %[src_ptr], %[src_ptr], 8 \n\t" - "daddiu %[dst_ptr], %[dst_ptr], 8 \n\t" - "daddiu %[width], %[width], -8 \n\t" - "bgtz %[width], 1b \n\t" - "nop \n\t" - : - : [dst_ptr] "r"(dst_ptr), [src_ptr] "r"(src_ptr), [width] "r"(width) - : "memory"); - return; - } - if (source_y_fraction == 128) { - uint64_t uv = 0x0; - uint64_t uv_stride = 0x0; - __asm__ volatile( - "1: \n\t" - "gsldrc1 %[uv], 0x0(%[src_ptr]) \n\t" - "gsldlc1 %[uv], 0x7(%[src_ptr]) \n\t" - "daddu $t0, %[src_ptr], %[stride] \n\t" - "gsldrc1 %[uv_stride], 0x0($t0) \n\t" - "gsldlc1 %[uv_stride], 0x7($t0) \n\t" - - "pavgb %[uv], %[uv], %[uv_stride] \n\t" - "gssdrc1 %[uv], 0x0(%[dst_ptr]) \n\t" - "gssdlc1 %[uv], 0x7(%[dst_ptr]) \n\t" - - "daddiu %[src_ptr], %[src_ptr], 8 \n\t" - "daddiu %[dst_ptr], %[dst_ptr], 8 \n\t" - "daddiu %[width], %[width], -8 \n\t" - "bgtz %[width], 1b \n\t" - "nop \n\t" - : [uv] "=&f"(uv), [uv_stride] "=&f"(uv_stride) - : [src_ptr] "r"(src_ptr), [dst_ptr] "r"(dst_ptr), [width] "r"(width), - [stride] "r"((int64_t)src_stride) - : "memory"); - return; - } - const uint8_t* src_ptr1 = src_ptr + src_stride; - uint64_t temp; - uint64_t data[4]; - uint64_t zero = 0x0; - uint64_t c0 = 0x0080008000800080; - uint64_t fy0 = 0x0100010001000100; - uint64_t shift = 0x8; - __asm__ volatile( - "pshufh %[fy1], %[fy1], %[zero] \n\t" - "psubh %[fy0], %[fy0], %[fy1] \n\t" - "1: \n\t" - "gsldrc1 %[t0], 0x0(%[src_ptr]) \n\t" - "gsldlc1 %[t0], 0x7(%[src_ptr]) \n\t" - "punpcklbh %[d0], %[t0], %[zero] \n\t" - "punpckhbh %[d1], %[t0], %[zero] \n\t" - "gsldrc1 %[t0], 0x0(%[src_ptr1]) \n\t" - "gsldlc1 %[t0], 0x7(%[src_ptr1]) \n\t" - "punpcklbh %[d2], %[t0], %[zero] \n\t" - "punpckhbh %[d3], %[t0], %[zero] \n\t" - - "pmullh %[d0], %[d0], %[fy0] \n\t" - "pmullh %[d2], %[d2], %[fy1] \n\t" - "paddh %[d0], %[d0], %[d2] \n\t" - "paddh %[d0], %[d0], %[c0] \n\t" - "psrlh %[d0], %[d0], %[shift] \n\t" - - "pmullh %[d1], %[d1], %[fy0] \n\t" - "pmullh %[d3], %[d3], %[fy1] \n\t" - "paddh %[d1], %[d1], %[d3] 
\n\t" - "paddh %[d1], %[d1], %[c0] \n\t" - "psrlh %[d1], %[d1], %[shift] \n\t" - - "packushb %[d0], %[d0], %[d1] \n\t" - "gssdrc1 %[d0], 0x0(%[dst_ptr]) \n\t" - "gssdlc1 %[d0], 0x7(%[dst_ptr]) \n\t" - "daddiu %[src_ptr], %[src_ptr], 8 \n\t" - "daddiu %[src_ptr1], %[src_ptr1], 8 \n\t" - "daddiu %[dst_ptr], %[dst_ptr], 8 \n\t" - "daddiu %[width], %[width], -8 \n\t" - "bgtz %[width], 1b \n\t" - "nop \n\t" - : [t0] "=&f"(temp), [d0] "=&f"(data[0]), [d1] "=&f"(data[1]), - [d2] "=&f"(data[2]), [d3] "=&f"(data[3]) - : [src_ptr] "r"(src_ptr), [src_ptr1] "r"(src_ptr1), - [dst_ptr] "r"(dst_ptr), [width] "r"(width), - [fy1] "f"(source_y_fraction), [fy0] "f"(fy0), [c0] "f"(c0), - [shift] "f"(shift), [zero] "f"(zero) - : "memory"); -} - -// Use first 4 shuffler values to reorder ARGB channels. -void ARGBShuffleRow_MMI(const uint8_t* src_argb, - uint8_t* dst_argb, - const uint8_t* shuffler, - int width) { - uint64_t source, dest0, dest1, dest; - const uint64_t mask0 = 0x0; - const uint64_t mask1 = (shuffler[0] & 0x03) | ((shuffler[1] & 0x03) << 2) | - ((shuffler[2] & 0x03) << 4) | - ((shuffler[3] & 0x03) << 6); - - __asm__ volatile( - "1: \n\t" - "gsldlc1 %[src], 0x07(%[src_ptr]) \n\t" - "gsldrc1 %[src], 0x00(%[src_ptr]) \n\t" - - "punpcklbh %[dest0], %[src], %[mask0] \n\t" - "pshufh %[dest0], %[dest0], %[mask1] \n\t" - "punpckhbh %[dest1], %[src], %[mask0] \n\t" - "pshufh %[dest1], %[dest1], %[mask1] \n\t" - "packushb %[dest], %[dest0], %[dest1] \n\t" - - "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t" - "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t" - - "daddiu %[src_ptr], %[src_ptr], 0x08 \n\t" - "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t" - "daddi %[width], %[width], -0x02 \n\t" - "bnez %[width], 1b \n\t" - : [src] "=&f"(source), [dest] "=&f"(dest), [dest0] "=&f"(dest0), - [dest1] "=&f"(dest1) - : [src_ptr] "r"(src_argb), [dst_ptr] "r"(dst_argb), [mask0] "f"(mask0), - [mask1] "f"(mask1), [width] "r"(width) - : "memory"); -} - -void I422ToYUY2Row_MMI(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* dst_frame, - int width) { - uint64_t temp[3]; - uint64_t vu = 0x0; - __asm__ volatile( - "1: \n\t" - "gsldlc1 %[ty], 0x7(%[src_y]) \n\t" // r=src_sobelx[i] - "gsldrc1 %[ty], 0x0(%[src_y]) \n\t" // r=src_sobelx[i] - "gslwlc1 %[tu], 0x3(%[src_u]) \n\t" // b=src_sobely[i] - "gslwrc1 %[tu], 0x0(%[src_u]) \n\t" // b=src_sobely[i] - "gslwlc1 %[tv], 0x3(%[src_v]) \n\t" // b=src_sobely[i] - "gslwrc1 %[tv], 0x0(%[src_v]) \n\t" // b=src_sobely[i] - "punpcklbh %[vu], %[tu], %[tv] \n\t" // g - "punpcklbh %[tu], %[ty], %[vu] \n\t" // g - "gssdlc1 %[tu], 0x7(%[dst_frame]) \n\t" - "gssdrc1 %[tu], 0x0(%[dst_frame]) \n\t" - "punpckhbh %[tu], %[ty], %[vu] \n\t" // g - "gssdlc1 %[tu], 0x0F(%[dst_frame]) \n\t" - "gssdrc1 %[tu], 0x08(%[dst_frame]) \n\t" - "daddiu %[src_y], %[src_y], 8 \n\t" - "daddiu %[src_u], %[src_u], 4 \n\t" - "daddiu %[src_v], %[src_v], 4 \n\t" - "daddiu %[dst_frame], %[dst_frame], 16 \n\t" - "daddiu %[width], %[width], -8 \n\t" - "bgtz %[width], 1b \n\t" - "nop \n\t" - : [ty] "=&f"(temp[1]), [tu] "=&f"(temp[1]), [tv] "=&f"(temp[1]), - [vu] "=&f"(vu) - : [src_y] "r"(src_y), [src_u] "r"(src_u), [src_v] "r"(src_v), - [dst_frame] "r"(dst_frame), [width] "r"(width) - : "memory"); -} - -void I422ToUYVYRow_MMI(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* dst_frame, - int width) { - uint64_t temp[3]; - uint64_t vu = 0x0; - __asm__ volatile( - "1: \n\t" - "gsldlc1 %[ty], 0x7(%[src_y]) \n\t" // r=src_sobelx[i] - "gsldrc1 %[ty], 0x0(%[src_y]) \n\t" // 
r=src_sobelx[i] - "gslwlc1 %[tu], 0x3(%[src_u]) \n\t" // b=src_sobely[i] - "gslwrc1 %[tu], 0x0(%[src_u]) \n\t" // b=src_sobely[i] - "gslwlc1 %[tv], 0x3(%[src_v]) \n\t" // b=src_sobely[i] - "gslwrc1 %[tv], 0x0(%[src_v]) \n\t" // b=src_sobely[i] - "punpcklbh %[vu], %[tu], %[tv] \n\t" // g - "punpcklbh %[tu], %[vu], %[ty] \n\t" // g - "gssdlc1 %[tu], 0x7(%[dst_frame]) \n\t" - "gssdrc1 %[tu], 0x0(%[dst_frame]) \n\t" - "punpckhbh %[tu], %[vu], %[ty] \n\t" // g - "gssdlc1 %[tu], 0x0F(%[dst_frame]) \n\t" - "gssdrc1 %[tu], 0x08(%[dst_frame]) \n\t" - "daddiu %[src_y], %[src_y], 8 \n\t" - "daddiu %[src_u], %[src_u], 4 \n\t" - "daddiu %[src_v], %[src_v], 4 \n\t" - "daddiu %[dst_frame], %[dst_frame], 16 \n\t" - "daddiu %[width], %[width], -8 \n\t" - "bgtz %[width], 1b \n\t" - "nop \n\t" - : [ty] "=&f"(temp[1]), [tu] "=&f"(temp[1]), [tv] "=&f"(temp[1]), - [vu] "=&f"(vu) - : [src_y] "r"(src_y), [src_u] "r"(src_u), [src_v] "r"(src_v), - [dst_frame] "r"(dst_frame), [width] "r"(width) - : "memory"); -} - -void ARGBCopyAlphaRow_MMI(const uint8_t* src, uint8_t* dst, int width) { - uint64_t source, dest; - const uint64_t mask0 = 0xff000000ff000000ULL; - const uint64_t mask1 = ~mask0; - - __asm__ volatile( - "1: \n\t" - "gsldlc1 %[src], 0x07(%[src_ptr]) \n\t" - "gsldrc1 %[src], 0x00(%[src_ptr]) \n\t" - "gsldlc1 %[dest], 0x07(%[dst_ptr]) \n\t" - "gsldrc1 %[dest], 0x00(%[dst_ptr]) \n\t" - - "and %[src], %[src], %[mask0] \n\t" - "and %[dest], %[dest], %[mask1] \n\t" - "or %[dest], %[src], %[dest] \n\t" - "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t" - "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t" - - "daddiu %[src_ptr], %[src_ptr], 0x08 \n\t" - "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t" - "daddi %[width], %[width], -0x02 \n\t" - "bnez %[width], 1b \n\t" - : [src] "=&f"(source), [dest] "=&f"(dest) - : [src_ptr] "r"(src), [dst_ptr] "r"(dst), [mask0] "f"(mask0), - [mask1] "f"(mask1), [width] "r"(width) - : "memory"); -} - -void ARGBExtractAlphaRow_MMI(const uint8_t* src_argb, - uint8_t* dst_a, - int width) { - uint64_t src, dest0, dest1, dest_lo, dest_hi, dest; - const uint64_t mask = 0xff000000ff000000ULL; - const uint64_t shift = 0x18; - - __asm__ volatile( - "1: \n\t" - "gsldlc1 %[src], 0x07(%[src_ptr]) \n\t" - "gsldrc1 %[src], 0x00(%[src_ptr]) \n\t" - "and %[dest0], %[src], %[mask] \n\t" - "psrlw %[dest0], %[dest0], %[shift] \n\t" - "gsldlc1 %[src], 0x0f(%[src_ptr]) \n\t" - "gsldrc1 %[src], 0x08(%[src_ptr]) \n\t" - "and %[dest1], %[src], %[mask] \n\t" - "psrlw %[dest1], %[dest1], %[shift] \n\t" - "packsswh %[dest_lo], %[dest0], %[dest1] \n\t" - - "gsldlc1 %[src], 0x17(%[src_ptr]) \n\t" - "gsldrc1 %[src], 0x10(%[src_ptr]) \n\t" - "and %[dest0], %[src], %[mask] \n\t" - "psrlw %[dest0], %[dest0], %[shift] \n\t" - "gsldlc1 %[src], 0x1f(%[src_ptr]) \n\t" - "gsldrc1 %[src], 0x18(%[src_ptr]) \n\t" - "and %[dest1], %[src], %[mask] \n\t" - "psrlw %[dest1], %[dest1], %[shift] \n\t" - "packsswh %[dest_hi], %[dest0], %[dest1] \n\t" - - "packushb %[dest], %[dest_lo], %[dest_hi] \n\t" - - "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t" - "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t" - - "daddiu %[src_ptr], %[src_ptr], 0x20 \n\t" - "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t" - "daddi %[width], %[width], -0x08 \n\t" - "bnez %[width], 1b \n\t" - : [src] "=&f"(src), [dest] "=&f"(dest), [dest0] "=&f"(dest0), - [dest1] "=&f"(dest1), [dest_lo] "=&f"(dest_lo), [dest_hi] "=&f"(dest_hi) - : [src_ptr] "r"(src_argb), [dst_ptr] "r"(dst_a), [mask] "f"(mask), - [shift] "f"(shift), [width] "r"(width) - : "memory"); -} - -void ARGBCopyYToAlphaRow_MMI(const uint8_t* 
src, uint8_t* dst, int width) { - uint64_t source, dest0, dest1, dest; - const uint64_t mask0 = 0x0; - const uint64_t mask1 = 0x00ffffff00ffffffULL; - - __asm__ volatile( - "1: \n\t" - "gsldlc1 %[src], 0x07(%[src_ptr]) \n\t" - "gsldrc1 %[src], 0x00(%[src_ptr]) \n\t" - - "punpcklbh %[dest0], %[mask0], %[src] \n\t" - "punpcklhw %[dest1], %[mask0], %[dest0] \n\t" - "gsldlc1 %[dest], 0x07(%[dst_ptr]) \n\t" - "gsldrc1 %[dest], 0x00(%[dst_ptr]) \n\t" - "and %[dest], %[dest], %[mask1] \n\t" - "or %[dest], %[dest], %[dest1] \n\t" - "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t" - "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t" - "punpckhhw %[dest1], %[mask0], %[dest0] \n\t" - "gsldlc1 %[dest], 0x0f(%[dst_ptr]) \n\t" - "gsldrc1 %[dest], 0x08(%[dst_ptr]) \n\t" - "and %[dest], %[dest], %[mask1] \n\t" - "or %[dest], %[dest], %[dest1] \n\t" - "gssdlc1 %[dest], 0x0f(%[dst_ptr]) \n\t" - "gssdrc1 %[dest], 0x08(%[dst_ptr]) \n\t" - - "punpckhbh %[dest0], %[mask0], %[src] \n\t" - "punpcklhw %[dest1], %[mask0], %[dest0] \n\t" - "gsldlc1 %[dest], 0x17(%[dst_ptr]) \n\t" - "gsldrc1 %[dest], 0x10(%[dst_ptr]) \n\t" - "and %[dest], %[dest], %[mask1] \n\t" - "or %[dest], %[dest], %[dest1] \n\t" - "gssdlc1 %[dest], 0x17(%[dst_ptr]) \n\t" - "gssdrc1 %[dest], 0x10(%[dst_ptr]) \n\t" - "punpckhhw %[dest1], %[mask0], %[dest0] \n\t" - "gsldlc1 %[dest], 0x1f(%[dst_ptr]) \n\t" - "gsldrc1 %[dest], 0x18(%[dst_ptr]) \n\t" - "and %[dest], %[dest], %[mask1] \n\t" - "or %[dest], %[dest], %[dest1] \n\t" - "gssdlc1 %[dest], 0x1f(%[dst_ptr]) \n\t" - "gssdrc1 %[dest], 0x18(%[dst_ptr]) \n\t" - - "daddiu %[src_ptr], %[src_ptr], 0x08 \n\t" - "daddiu %[dst_ptr], %[dst_ptr], 0x20 \n\t" - "daddi %[width], %[width], -0x08 \n\t" - "bnez %[width], 1b \n\t" - : [src] "=&f"(source), [dest] "=&f"(dest), [dest0] "=&f"(dest0), - [dest1] "=&f"(dest1) - : [src_ptr] "r"(src), [dst_ptr] "r"(dst), [mask0] "f"(mask0), - [mask1] "f"(mask1), [width] "r"(width) - : "memory"); -} - -void I444ToARGBRow_MMI(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* rgb_buf, - const struct YuvConstants* yuvconstants, - int width) { - uint64_t y,u,v; - uint64_t b_vec[2],g_vec[2],r_vec[2]; - uint64_t mask = 0xff00ff00ff00ff00ULL; - uint64_t ub,ug,vg,vr,bb,bg,br,yg; - __asm__ volatile ( - "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t"//yg - "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t"//bb - "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t"//ub - "or %[ub], %[ub], %[mask] \n\t"//must sign extension - "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t"//bg - "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t"//ug - "punpcklbh %[ug], %[ug], %[zero] \n\t" - "pshufh %[ug], %[ug], %[zero] \n\t" - "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t"//vg - "punpcklbh %[vg], %[vg], %[zero] \n\t" - "pshufh %[vg], %[vg], %[five] \n\t" - "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t"//br - "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t"//vr - "punpcklbh %[vr], %[vr], %[zero] \n\t" - "pshufh %[vr], %[vr], %[five] \n\t" - "or %[vr], %[vr], %[mask] \n\t"//sign extension - - "1: \n\t" - "gslwlc1 %[y], 0x03(%[y_ptr]) \n\t" - "gslwrc1 %[y], 0x00(%[y_ptr]) \n\t" - "gslwlc1 %[u], 0x03(%[u_ptr]) \n\t" - "gslwrc1 %[u], 0x00(%[u_ptr]) \n\t" - "gslwlc1 %[v], 0x03(%[v_ptr]) \n\t" - "gslwrc1 %[v], 0x00(%[v_ptr]) \n\t" - - "punpcklbh %[y], %[y], %[y] \n\t"//y*0x0101 - "pmulhuh %[y], %[y], %[yg] \n\t"//y1 - - "punpcklbh %[u], %[u], %[zero] \n\t"//u - "paddsh %[b_vec0], %[y], %[bb] \n\t" - "pmullh %[b_vec1], %[u], %[ub] \n\t" - "psubsh %[b_vec0], %[b_vec0], %[b_vec1] \n\t" - "psrah %[b_vec0], %[b_vec0], %[six] \n\t" - - "punpcklbh %[v], %[v], %[zero] 
\n\t"//v - "paddsh %[g_vec0], %[y], %[bg] \n\t" - "pmullh %[g_vec1], %[u], %[ug] \n\t"//u*ug - "psubsh %[g_vec0], %[g_vec0], %[g_vec1] \n\t" - "pmullh %[g_vec1], %[v], %[vg] \n\t"//v*vg - "psubsh %[g_vec0], %[g_vec0], %[g_vec1] \n\t" - "psrah %[g_vec0], %[g_vec0], %[six] \n\t" - - "paddsh %[r_vec0], %[y], %[br] \n\t" - "pmullh %[r_vec1], %[v], %[vr] \n\t"//v*vr - "psubsh %[r_vec0], %[r_vec0], %[r_vec1] \n\t" - "psrah %[r_vec0], %[r_vec0], %[six] \n\t" - - "packushb %[r_vec0], %[b_vec0], %[r_vec0] \n\t"//rrrrbbbb - "packushb %[g_vec0], %[g_vec0], %[alpha] \n\t"//ffffgggg - "punpcklwd %[g_vec0], %[g_vec0], %[alpha] \n\t" - "punpcklbh %[b_vec0], %[r_vec0], %[g_vec0] \n\t"//gbgbgbgb - "punpckhbh %[r_vec0], %[r_vec0], %[g_vec0] \n\t"//frfrfrfr - "punpcklhw %[g_vec0], %[b_vec0], %[r_vec0] \n\t"//frgbfrgb - "punpckhhw %[g_vec1], %[b_vec0], %[r_vec0] \n\t"//frgbfrgb - "gssdlc1 %[g_vec0], 0x07(%[rgbbuf_ptr]) \n\t" - "gssdrc1 %[g_vec0], 0x00(%[rgbbuf_ptr]) \n\t" - "gssdlc1 %[g_vec1], 0x0f(%[rgbbuf_ptr]) \n\t" - "gssdrc1 %[g_vec1], 0x08(%[rgbbuf_ptr]) \n\t" - - "daddiu %[y_ptr], %[y_ptr], 0x04 \n\t" - "daddiu %[u_ptr], %[u_ptr], 0x04 \n\t" - "daddiu %[v_ptr], %[v_ptr], 0x04 \n\t" - "daddiu %[rgbbuf_ptr], %[rgbbuf_ptr], 0x10 \n\t" - "daddi %[width], %[width], -0x04 \n\t" - "bnez %[width], 1b \n\t" - : [y]"=&f"(y), - [u]"=&f"(u), [v]"=&f"(v), - [b_vec0]"=&f"(b_vec[0]), [b_vec1]"=&f"(b_vec[1]), - [g_vec0]"=&f"(g_vec[0]), [g_vec1]"=&f"(g_vec[1]), - [r_vec0]"=&f"(r_vec[0]), [r_vec1]"=&f"(r_vec[1]), - [ub]"=&f"(ub), [ug]"=&f"(ug), - [vg]"=&f"(vg), [vr]"=&f"(vr), - [bb]"=&f"(bb), [bg]"=&f"(bg), - [br]"=&f"(br), [yg]"=&f"(yg) - : [y_ptr]"r"(src_y), [u_ptr]"r"(src_u), - [v_ptr]"r"(src_v), [rgbbuf_ptr]"r"(rgb_buf), - [yuvcons_ptr]"r"(yuvconstants), [width]"r"(width), - [zero]"f"(0x00), [alpha]"f"(-1), - [six]"f"(0x6), [five]"f"(0x55), - [mask]"f"(mask) - : "memory" - ); -} - -// Also used for 420 -void I422ToARGBRow_MMI(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* rgb_buf, - const struct YuvConstants* yuvconstants, - int width) { - uint64_t y,u,v; - uint64_t b_vec[2],g_vec[2],r_vec[2]; - uint64_t mask = 0xff00ff00ff00ff00ULL; - uint64_t ub,ug,vg,vr,bb,bg,br,yg; - - __asm__ volatile( - "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t"//yg - "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t"//bb - "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t"//ub - "or %[ub], %[ub], %[mask] \n\t"//must sign extension - "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t"//bg - "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t"//ug - "punpcklbh %[ug], %[ug], %[zero] \n\t" - "pshufh %[ug], %[ug], %[zero] \n\t" - "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t"//vg - "punpcklbh %[vg], %[vg], %[zero] \n\t" - "pshufh %[vg], %[vg], %[five] \n\t" - "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t"//br - "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t"//vr - "punpcklbh %[vr], %[vr], %[zero] \n\t" - "pshufh %[vr], %[vr], %[five] \n\t" - "or %[vr], %[vr], %[mask] \n\t"//sign extension - - "1: \n\t" - "gslwlc1 %[y], 0x03(%[y_ptr]) \n\t" - "gslwrc1 %[y], 0x00(%[y_ptr]) \n\t" - "gslwlc1 %[u], 0x03(%[u_ptr]) \n\t" - "gslwrc1 %[u], 0x00(%[u_ptr]) \n\t" - "gslwlc1 %[v], 0x03(%[v_ptr]) \n\t" - "gslwrc1 %[v], 0x00(%[v_ptr]) \n\t" - - "punpcklbh %[y], %[y], %[y] \n\t"//y*0x0101 - "pmulhuh %[y], %[y], %[yg] \n\t"//y1 - - //u3|u2|u1|u0 --> u1|u1|u0|u0 - "punpcklbh %[u], %[u], %[u] \n\t"//u - "punpcklbh %[u], %[u], %[zero] \n\t" - "paddsh %[b_vec0], %[y], %[bb] \n\t" - "pmullh %[b_vec1], %[u], %[ub] \n\t" - "psubsh %[b_vec0], %[b_vec0], %[b_vec1] \n\t" - "psrah %[b_vec0], %[b_vec0], %[six] 
\n\t" - - //v3|v2|v1|v0 --> v1|v1|v0|v0 - "punpcklbh %[v], %[v], %[v] \n\t"//v - "punpcklbh %[v], %[v], %[zero] \n\t" - "paddsh %[g_vec0], %[y], %[bg] \n\t" - "pmullh %[g_vec1], %[u], %[ug] \n\t"//u*ug - "psubsh %[g_vec0], %[g_vec0], %[g_vec1] \n\t" - "pmullh %[g_vec1], %[v], %[vg] \n\t"//v*vg - "psubsh %[g_vec0], %[g_vec0], %[g_vec1] \n\t" - "psrah %[g_vec0], %[g_vec0], %[six] \n\t" - - "paddsh %[r_vec0], %[y], %[br] \n\t" - "pmullh %[r_vec1], %[v], %[vr] \n\t"//v*vr - "psubsh %[r_vec0], %[r_vec0], %[r_vec1] \n\t" - "psrah %[r_vec0], %[r_vec0], %[six] \n\t" - - "packushb %[r_vec0], %[b_vec0], %[r_vec0] \n\t"//rrrrbbbb - "packushb %[g_vec0], %[g_vec0], %[alpha] \n\t"//ffffgggg - "punpcklwd %[g_vec0], %[g_vec0], %[alpha] \n\t" - "punpcklbh %[b_vec0], %[r_vec0], %[g_vec0] \n\t"//gbgbgbgb - "punpckhbh %[r_vec0], %[r_vec0], %[g_vec0] \n\t"//frfrfrfr - "punpcklhw %[g_vec0], %[b_vec0], %[r_vec0] \n\t"//frgbfrgb - "punpckhhw %[g_vec1], %[b_vec0], %[r_vec0] \n\t"//frgbfrgb - "gssdlc1 %[g_vec0], 0x07(%[rgbbuf_ptr]) \n\t" - "gssdrc1 %[g_vec0], 0x00(%[rgbbuf_ptr]) \n\t" - "gssdlc1 %[g_vec1], 0x0f(%[rgbbuf_ptr]) \n\t" - "gssdrc1 %[g_vec1], 0x08(%[rgbbuf_ptr]) \n\t" - - "daddiu %[y_ptr], %[y_ptr], 0x04 \n\t" - "daddiu %[u_ptr], %[u_ptr], 0x02 \n\t" - "daddiu %[v_ptr], %[v_ptr], 0x02 \n\t" - "daddiu %[rgbbuf_ptr], %[rgbbuf_ptr], 0x10 \n\t" - "daddi %[width], %[width], -0x04 \n\t" - "bnez %[width], 1b \n\t" - - : [y]"=&f"(y), - [u]"=&f"(u), [v]"=&f"(v), - [b_vec0]"=&f"(b_vec[0]), [b_vec1]"=&f"(b_vec[1]), - [g_vec0]"=&f"(g_vec[0]), [g_vec1]"=&f"(g_vec[1]), - [r_vec0]"=&f"(r_vec[0]), [r_vec1]"=&f"(r_vec[1]), - [ub]"=&f"(ub), [ug]"=&f"(ug), - [vg]"=&f"(vg), [vr]"=&f"(vr), - [bb]"=&f"(bb), [bg]"=&f"(bg), - [br]"=&f"(br), [yg]"=&f"(yg) - : [y_ptr]"r"(src_y), [u_ptr]"r"(src_u), - [v_ptr]"r"(src_v), [rgbbuf_ptr]"r"(rgb_buf), - [yuvcons_ptr]"r"(yuvconstants), [width]"r"(width), - [zero]"f"(0x00), [alpha]"f"(-1), - [six]"f"(0x6), [five]"f"(0x55), - [mask]"f"(mask) - : "memory" - ); -} - -// 10 bit YUV to ARGB -void I210ToARGBRow_MMI(const uint16_t* src_y, - const uint16_t* src_u, - const uint16_t* src_v, - uint8_t* rgb_buf, - const struct YuvConstants* yuvconstants, - int width) { - uint64_t y,u,v; - uint64_t b_vec[2],g_vec[2],r_vec[2]; - uint64_t mask = 0xff00ff00ff00ff00ULL; - uint64_t ub,ug,vg,vr,bb,bg,br,yg; - - __asm__ volatile( - "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t" - "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t" - "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t" - "or %[ub], %[ub], %[mask] \n\t" - "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t" - "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t" - "punpcklbh %[ug], %[ug], %[zero] \n\t" - "pshufh %[ug], %[ug], %[zero] \n\t" - "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t" - "punpcklbh %[vg], %[vg], %[zero] \n\t" - "pshufh %[vg], %[vg], %[five] \n\t" - "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t" - "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t" - "punpcklbh %[vr], %[vr], %[zero] \n\t" - "pshufh %[vr], %[vr], %[five] \n\t" - "or %[vr], %[vr], %[mask] \n\t" - - "1: \n\t" - "gsldlc1 %[y], 0x07(%[y_ptr]) \n\t" - "gsldrc1 %[y], 0x00(%[y_ptr]) \n\t" - "gslwlc1 %[u], 0x03(%[u_ptr]) \n\t" - "gslwrc1 %[u], 0x00(%[u_ptr]) \n\t" - "gslwlc1 %[v], 0x03(%[v_ptr]) \n\t" - "gslwrc1 %[v], 0x00(%[v_ptr]) \n\t" - - "psllh %[y], %[y], %[six] \n\t" - "pmulhuh %[y], %[y], %[yg] \n\t" - - "punpcklhw %[u], %[u], %[u] \n\t" - "psrah %[u], %[u], %[two] \n\t" - "punpcklhw %[v], %[v], %[v] \n\t" - "psrah %[v], %[v], %[two] \n\t" - "pminsh %[u], %[u], %[mask1] \n\t" - "pminsh %[v], %[v], %[mask1] \n\t" - - "paddsh %[b_vec0], %[y], 
%[bb] \n\t" - "pmullh %[b_vec1], %[u], %[ub] \n\t" - "psubsh %[b_vec0], %[b_vec0], %[b_vec1] \n\t" - - "paddsh %[g_vec0], %[y], %[bg] \n\t" - "pmullh %[g_vec1], %[u], %[ug] \n\t" - "psubsh %[g_vec0], %[g_vec0], %[g_vec1] \n\t" - "pmullh %[g_vec1], %[v], %[vg] \n\t" - "psubsh %[g_vec0], %[g_vec0], %[g_vec1] \n\t" - - "paddsh %[r_vec0], %[y], %[br] \n\t" - "pmullh %[r_vec1], %[v], %[vr] \n\t" - "psubsh %[r_vec0], %[r_vec0], %[r_vec1] \n\t" - - "psrah %[b_vec0], %[b_vec0], %[six] \n\t" - "psrah %[g_vec0], %[g_vec0], %[six] \n\t" - "psrah %[r_vec0], %[r_vec0], %[six] \n\t" - - "packushb %[r_vec0], %[b_vec0], %[r_vec0] \n\t" - "packushb %[g_vec0], %[g_vec0], %[alpha] \n\t" - "punpcklwd %[g_vec0], %[g_vec0], %[alpha] \n\t" - "punpcklbh %[b_vec0], %[r_vec0], %[g_vec0] \n\t" - "punpckhbh %[r_vec0], %[r_vec0], %[g_vec0] \n\t" - "punpcklhw %[g_vec0], %[b_vec0], %[r_vec0] \n\t" - "punpckhhw %[g_vec1], %[b_vec0], %[r_vec0] \n\t" - "gssdlc1 %[g_vec0], 0x07(%[rgbbuf_ptr]) \n\t" - "gssdrc1 %[g_vec0], 0x00(%[rgbbuf_ptr]) \n\t" - "gssdlc1 %[g_vec1], 0x0f(%[rgbbuf_ptr]) \n\t" - "gssdrc1 %[g_vec1], 0x08(%[rgbbuf_ptr]) \n\t" - - "daddiu %[y_ptr], %[y_ptr], 0x08 \n\t" - "daddiu %[u_ptr], %[u_ptr], 0x04 \n\t" - "daddiu %[v_ptr], %[v_ptr], 0x04 \n\t" - "daddiu %[rgbbuf_ptr], %[rgbbuf_ptr], 0x10 \n\t" - "daddi %[width], %[width], -0x04 \n\t" - "bnez %[width], 1b \n\t" - - : [y]"=&f"(y), - [u]"=&f"(u), [v]"=&f"(v), - [b_vec0]"=&f"(b_vec[0]), [b_vec1]"=&f"(b_vec[1]), - [g_vec0]"=&f"(g_vec[0]), [g_vec1]"=&f"(g_vec[1]), - [r_vec0]"=&f"(r_vec[0]), [r_vec1]"=&f"(r_vec[1]), - [ub]"=&f"(ub), [ug]"=&f"(ug), - [vg]"=&f"(vg), [vr]"=&f"(vr), - [bb]"=&f"(bb), [bg]"=&f"(bg), - [br]"=&f"(br), [yg]"=&f"(yg) - : [y_ptr]"r"(src_y), [u_ptr]"r"(src_u), - [v_ptr]"r"(src_v), [rgbbuf_ptr]"r"(rgb_buf), - [yuvcons_ptr]"r"(yuvconstants), [width]"r"(width), - [zero]"f"(0x00), [alpha]"f"(-1), - [six]"f"(0x6), [five]"f"(0x55), - [mask]"f"(mask), [two]"f"(0x02), - [mask1]"f"(0x00ff00ff00ff00ff) - : "memory" - ); -} - -void I422AlphaToARGBRow_MMI(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - const uint8_t* src_a, - uint8_t* rgb_buf, - const struct YuvConstants* yuvconstants, - int width) { - uint64_t y,u,v,a; - uint64_t b_vec[2],g_vec[2],r_vec[2]; - uint64_t mask = 0xff00ff00ff00ff00ULL; - uint64_t ub,ug,vg,vr,bb,bg,br,yg; - - __asm__ volatile( - "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t" - "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t" - "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t" - "or %[ub], %[ub], %[mask] \n\t" - "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t" - "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t" - "punpcklbh %[ug], %[ug], %[zero] \n\t" - "pshufh %[ug], %[ug], %[zero] \n\t" - "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t" - "punpcklbh %[vg], %[vg], %[zero] \n\t" - "pshufh %[vg], %[vg], %[five] \n\t" - "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t" - "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t" - "punpcklbh %[vr], %[vr], %[zero] \n\t" - "pshufh %[vr], %[vr], %[five] \n\t" - "or %[vr], %[vr], %[mask] \n\t" - - "1: \n\t" - "gslwlc1 %[y], 0x03(%[y_ptr]) \n\t" - "gslwrc1 %[y], 0x00(%[y_ptr]) \n\t" - "gslwlc1 %[u], 0x03(%[u_ptr]) \n\t" - "gslwrc1 %[u], 0x00(%[u_ptr]) \n\t" - "gslwlc1 %[v], 0x03(%[v_ptr]) \n\t" - "gslwrc1 %[v], 0x00(%[v_ptr]) \n\t" - "gslwlc1 %[a], 0x03(%[a_ptr]) \n\t" - "gslwrc1 %[a], 0x00(%[a_ptr]) \n\t" - - "punpcklbh %[y], %[y], %[y] \n\t"//y*0x0101 - "pmulhuh %[y], %[y], %[yg] \n\t"//y1 - - //u3|u2|u1|u0 --> u1|u1|u0|u0 - "punpcklbh %[u], %[u], %[u] \n\t"//u - "punpcklbh %[u], %[u], %[zero] \n\t" - "paddsh %[b_vec0], %[y], %[bb] \n\t" 
- "pmullh %[b_vec1], %[u], %[ub] \n\t" - "psubsh %[b_vec0], %[b_vec0], %[b_vec1] \n\t" - "psrah %[b_vec0], %[b_vec0], %[six] \n\t" - - //v3|v2|v1|v0 --> v1|v1|v0|v0 - "punpcklbh %[v], %[v], %[v] \n\t" - "punpcklbh %[v], %[v], %[zero] \n\t" - "paddsh %[g_vec0], %[y], %[bg] \n\t" - "pmullh %[g_vec1], %[u], %[ug] \n\t" - "psubsh %[g_vec0], %[g_vec0], %[g_vec1] \n\t" - "pmullh %[g_vec1], %[v], %[vg] \n\t" - "psubsh %[g_vec0], %[g_vec0], %[g_vec1] \n\t" - "psrah %[g_vec0], %[g_vec0], %[six] \n\t" - - "paddsh %[r_vec0], %[y], %[br] \n\t" - "pmullh %[r_vec1], %[v], %[vr] \n\t" - "psubsh %[r_vec0], %[r_vec0], %[r_vec1] \n\t" - "psrah %[r_vec0], %[r_vec0], %[six] \n\t" - - "packushb %[r_vec0], %[b_vec0], %[r_vec0] \n\t"//rrrrbbbb - "packushb %[g_vec0], %[g_vec0], %[a] \n\t" - "punpcklwd %[g_vec0], %[g_vec0], %[a] \n\t"//aaaagggg - "punpcklbh %[b_vec0], %[r_vec0], %[g_vec0] \n\t" - "punpckhbh %[r_vec0], %[r_vec0], %[g_vec0] \n\t" - "punpcklhw %[g_vec0], %[b_vec0], %[r_vec0] \n\t" - "punpckhhw %[g_vec1], %[b_vec0], %[r_vec0] \n\t" - "gssdlc1 %[g_vec0], 0x07(%[rgbbuf_ptr]) \n\t" - "gssdrc1 %[g_vec0], 0x00(%[rgbbuf_ptr]) \n\t" - "gssdlc1 %[g_vec1], 0x0f(%[rgbbuf_ptr]) \n\t" - "gssdrc1 %[g_vec1], 0x08(%[rgbbuf_ptr]) \n\t" - - "daddiu %[y_ptr], %[y_ptr], 0x04 \n\t" - "daddiu %[a_ptr], %[a_ptr], 0x04 \n\t" - "daddiu %[u_ptr], %[u_ptr], 0x02 \n\t" - "daddiu %[v_ptr], %[v_ptr], 0x02 \n\t" - "daddiu %[rgbbuf_ptr], %[rgbbuf_ptr], 0x10 \n\t" - "daddi %[width], %[width], -0x04 \n\t" - "bnez %[width], 1b \n\t" - - : [y]"=&f"(y), [u]"=&f"(u), - [v]"=&f"(v), [a]"=&f"(a), - [b_vec0]"=&f"(b_vec[0]), [b_vec1]"=&f"(b_vec[1]), - [g_vec0]"=&f"(g_vec[0]), [g_vec1]"=&f"(g_vec[1]), - [r_vec0]"=&f"(r_vec[0]), [r_vec1]"=&f"(r_vec[1]), - [ub]"=&f"(ub), [ug]"=&f"(ug), - [vg]"=&f"(vg), [vr]"=&f"(vr), - [bb]"=&f"(bb), [bg]"=&f"(bg), - [br]"=&f"(br), [yg]"=&f"(yg) - : [y_ptr]"r"(src_y), [u_ptr]"r"(src_u), - [v_ptr]"r"(src_v), [rgbbuf_ptr]"r"(rgb_buf), - [yuvcons_ptr]"r"(yuvconstants), [width]"r"(width), - [a_ptr]"r"(src_a), [zero]"f"(0x00), - [six]"f"(0x6), [five]"f"(0x55), - [mask]"f"(mask) - : "memory" - ); -} - -void I422ToRGB24Row_MMI(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* rgb_buf, - const struct YuvConstants* yuvconstants, - int width) { - uint64_t y,u,v; - uint64_t b_vec[2],g_vec[2],r_vec[2]; - uint64_t mask = 0xff00ff00ff00ff00ULL; - uint64_t ub,ug,vg,vr,bb,bg,br,yg; - - __asm__ volatile( - "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t" - "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t" - "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t" - "or %[ub], %[ub], %[mask] \n\t" - "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t" - "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t" - "punpcklbh %[ug], %[ug], %[zero] \n\t" - "pshufh %[ug], %[ug], %[zero] \n\t" - "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t" - "punpcklbh %[vg], %[vg], %[zero] \n\t" - "pshufh %[vg], %[vg], %[five] \n\t" - "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t" - "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t" - "punpcklbh %[vr], %[vr], %[zero] \n\t" - "pshufh %[vr], %[vr], %[five] \n\t" - "or %[vr], %[vr], %[mask] \n\t" - - "1: \n\t" - "gslwlc1 %[y], 0x03(%[y_ptr]) \n\t" - "gslwrc1 %[y], 0x00(%[y_ptr]) \n\t" - "gslwlc1 %[u], 0x03(%[u_ptr]) \n\t" - "gslwrc1 %[u], 0x00(%[u_ptr]) \n\t" - "gslwlc1 %[v], 0x03(%[v_ptr]) \n\t" - "gslwrc1 %[v], 0x00(%[v_ptr]) \n\t" - - "punpcklbh %[y], %[y], %[y] \n\t"//y*0x0101 - "pmulhuh %[y], %[y], %[yg] \n\t"//y1 - - //u3|u2|u1|u0 --> u1|u1|u0|u0 - "punpcklbh %[u], %[u], %[u] \n\t"//u - "punpcklbh %[u], %[u], %[zero] \n\t" - "paddsh %[b_vec0], %[y], %[bb] 
\n\t" - "pmullh %[b_vec1], %[u], %[ub] \n\t" - "psubsh %[b_vec0], %[b_vec0], %[b_vec1] \n\t" - "psrah %[b_vec0], %[b_vec0], %[six] \n\t" - - //v3|v2|v1|v0 --> v1|v1|v0|v0 - "punpcklbh %[v], %[v], %[v] \n\t" - "punpcklbh %[v], %[v], %[zero] \n\t" - "paddsh %[g_vec0], %[y], %[bg] \n\t" - "pmullh %[g_vec1], %[u], %[ug] \n\t" - "psubsh %[g_vec0], %[g_vec0], %[g_vec1] \n\t" - "pmullh %[g_vec1], %[v], %[vg] \n\t" - "psubsh %[g_vec0], %[g_vec0], %[g_vec1] \n\t" - "psrah %[g_vec0], %[g_vec0], %[six] \n\t" - - "paddsh %[r_vec0], %[y], %[br] \n\t" - "pmullh %[r_vec1], %[v], %[vr] \n\t" - "psubsh %[r_vec0], %[r_vec0], %[r_vec1] \n\t" - "psrah %[r_vec0], %[r_vec0], %[six] \n\t" - - "packushb %[r_vec0], %[b_vec0], %[r_vec0] \n\t" - "packushb %[g_vec0], %[g_vec0], %[zero] \n\t" - "punpcklbh %[b_vec0], %[r_vec0], %[g_vec0] \n\t" - "punpckhbh %[r_vec0], %[r_vec0], %[g_vec0] \n\t" - "punpcklhw %[g_vec0], %[b_vec0], %[r_vec0] \n\t" - "punpckhhw %[g_vec1], %[b_vec0], %[r_vec0] \n\t" - - "punpckhwd %[r_vec0], %[g_vec0], %[g_vec0] \n\t" - "psllw %[r_vec1], %[r_vec0], %[lmove1] \n\t" - "or %[g_vec0], %[g_vec0], %[r_vec1] \n\t" - "psrlw %[r_vec1], %[r_vec0], %[rmove1] \n\t" - "pextrh %[r_vec1], %[r_vec1], %[zero] \n\t" - "pinsrh_2 %[g_vec0], %[g_vec0], %[r_vec1] \n\t" - "pextrh %[r_vec1], %[g_vec1], %[zero] \n\t" - "pinsrh_3 %[g_vec0], %[g_vec0], %[r_vec1] \n\t" - "pextrh %[r_vec1], %[g_vec1], %[one] \n\t" - "punpckhwd %[g_vec1], %[g_vec1], %[g_vec1] \n\t" - "psllw %[g_vec1], %[g_vec1], %[rmove1] \n\t" - "or %[g_vec1], %[g_vec1], %[r_vec1] \n\t" - "gssdlc1 %[g_vec0], 0x07(%[rgbbuf_ptr]) \n\t" - "gssdrc1 %[g_vec0], 0x00(%[rgbbuf_ptr]) \n\t" - "gsswlc1 %[g_vec1], 0x0b(%[rgbbuf_ptr]) \n\t" - "gsswrc1 %[g_vec1], 0x08(%[rgbbuf_ptr]) \n\t" - - - "daddiu %[y_ptr], %[y_ptr], 0x04 \n\t" - "daddiu %[u_ptr], %[u_ptr], 0x02 \n\t" - "daddiu %[v_ptr], %[v_ptr], 0x02 \n\t" - "daddiu %[rgbbuf_ptr], %[rgbbuf_ptr], 0x0c \n\t" - "daddi %[width], %[width], -0x04 \n\t" - "bnez %[width], 1b \n\t" - - : [y]"=&f"(y), [u]"=&f"(u), - [v]"=&f"(v), - [b_vec0]"=&f"(b_vec[0]), [b_vec1]"=&f"(b_vec[1]), - [g_vec0]"=&f"(g_vec[0]), [g_vec1]"=&f"(g_vec[1]), - [r_vec0]"=&f"(r_vec[0]), [r_vec1]"=&f"(r_vec[1]), - [ub]"=&f"(ub), [ug]"=&f"(ug), - [vg]"=&f"(vg), [vr]"=&f"(vr), - [bb]"=&f"(bb), [bg]"=&f"(bg), - [br]"=&f"(br), [yg]"=&f"(yg) - : [y_ptr]"r"(src_y), [u_ptr]"r"(src_u), - [v_ptr]"r"(src_v), [rgbbuf_ptr]"r"(rgb_buf), - [yuvcons_ptr]"r"(yuvconstants), [width]"r"(width), - [zero]"f"(0x00), [five]"f"(0x55), - [six]"f"(0x6), [mask]"f"(mask), - [lmove1]"f"(0x18), [rmove1]"f"(0x8), - [one]"f"(0x1) - : "memory" - ); -} - -void I422ToARGB4444Row_MMI(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* dst_argb4444, - const struct YuvConstants* yuvconstants, - int width) { - uint64_t y, u, v; - uint64_t b_vec, g_vec, r_vec, temp; - uint64_t ub,ug,vg,vr,bb,bg,br,yg; - - __asm__ volatile( - "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t" - "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t" - "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t" - "or %[ub], %[ub], %[mask] \n\t" - "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t" - "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t" - "punpcklbh %[ug], %[ug], %[zero] \n\t" - "pshufh %[ug], %[ug], %[zero] \n\t" - "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t" - "punpcklbh %[vg], %[vg], %[zero] \n\t" - "pshufh %[vg], %[vg], %[five] \n\t" - "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t" - "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t" - "punpcklbh %[vr], %[vr], %[zero] \n\t" - "pshufh %[vr], %[vr], %[five] \n\t" - "or %[vr], %[vr], %[mask] \n\t" - - "1: 
\n\t" - "gslwlc1 %[y], 0x03(%[y_ptr]) \n\t" - "gslwrc1 %[y], 0x00(%[y_ptr]) \n\t" - "gslwlc1 %[u], 0x03(%[u_ptr]) \n\t" - "gslwrc1 %[u], 0x00(%[u_ptr]) \n\t" - "gslwlc1 %[v], 0x03(%[v_ptr]) \n\t" - "gslwrc1 %[v], 0x00(%[v_ptr]) \n\t" - - "punpcklbh %[y], %[y], %[y] \n\t"//y*0x0101 - "pmulhuh %[y], %[y], %[yg] \n\t"//y1 - - //u3|u2|u1|u0 --> u1|u1|u0|u0 - "punpcklbh %[u], %[u], %[u] \n\t"//u - "punpcklbh %[u], %[u], %[zero] \n\t" - "paddsh %[b_vec], %[y], %[bb] \n\t" - "pmullh %[temp], %[u], %[ub] \n\t" - "psubsh %[b_vec], %[b_vec], %[temp] \n\t" - "psrah %[b_vec], %[b_vec], %[six] \n\t" - - //v3|v2|v1|v0 --> v1|v1|v0|v0 - "punpcklbh %[v], %[v], %[v] \n\t" - "punpcklbh %[v], %[v], %[zero] \n\t" - "paddsh %[g_vec], %[y], %[bg] \n\t" - "pmullh %[temp], %[u], %[ug] \n\t" - "psubsh %[g_vec], %[g_vec], %[temp] \n\t" - "pmullh %[temp], %[v], %[vg] \n\t" - "psubsh %[g_vec], %[g_vec], %[temp] \n\t" - "psrah %[g_vec], %[g_vec], %[six] \n\t" - - "paddsh %[r_vec], %[y], %[br] \n\t" - "pmullh %[temp], %[v], %[vr] \n\t" - "psubsh %[r_vec], %[r_vec], %[temp] \n\t" - "psrah %[r_vec], %[r_vec], %[six] \n\t" - - "packushb %[r_vec], %[b_vec], %[r_vec] \n\t" - "packushb %[g_vec], %[g_vec], %[zero] \n\t" - "punpcklwd %[g_vec], %[g_vec], %[alpha] \n\t" - "punpcklbh %[b_vec], %[r_vec], %[g_vec] \n\t" - "punpckhbh %[r_vec], %[r_vec], %[g_vec] \n\t" - "punpcklhw %[g_vec], %[b_vec], %[r_vec] \n\t" - "punpckhhw %[b_vec], %[b_vec], %[r_vec] \n\t" - - "and %[g_vec], %[g_vec], %[mask1] \n\t" - "psrlw %[g_vec], %[g_vec], %[four] \n\t" - "psrlw %[r_vec], %[g_vec], %[four] \n\t" - "or %[g_vec], %[g_vec], %[r_vec] \n\t" - "punpcklbh %[r_vec], %[alpha], %[zero] \n\t" - "and %[g_vec], %[g_vec], %[r_vec] \n\t" - - "and %[b_vec], %[b_vec], %[mask1] \n\t" - "psrlw %[b_vec], %[b_vec], %[four] \n\t" - "psrlw %[r_vec], %[b_vec], %[four] \n\t" - "or %[b_vec], %[b_vec], %[r_vec] \n\t" - "punpcklbh %[r_vec], %[alpha], %[zero] \n\t" - "and %[b_vec], %[b_vec], %[r_vec] \n\t" - "packushb %[g_vec], %[g_vec], %[b_vec] \n\t" - - "gssdlc1 %[g_vec], 0x07(%[dst_argb4444]) \n\t" - "gssdrc1 %[g_vec], 0x00(%[dst_argb4444]) \n\t" - - "daddiu %[y_ptr], %[y_ptr], 0x04 \n\t" - "daddiu %[u_ptr], %[u_ptr], 0x02 \n\t" - "daddiu %[v_ptr], %[v_ptr], 0x02 \n\t" - "daddiu %[dst_argb4444], %[dst_argb4444], 0x08 \n\t" - "daddi %[width], %[width], -0x04 \n\t" - "bnez %[width], 1b \n\t" - - : [y]"=&f"(y), [u]"=&f"(u), - [v]"=&f"(v), - [b_vec]"=&f"(b_vec), [g_vec]"=&f"(g_vec), - [r_vec]"=&f"(r_vec), [temp]"=&f"(temp), - [ub]"=&f"(ub), [ug]"=&f"(ug), - [vg]"=&f"(vg), [vr]"=&f"(vr), - [bb]"=&f"(bb), [bg]"=&f"(bg), - [br]"=&f"(br), [yg]"=&f"(yg) - : [y_ptr]"r"(src_y), [u_ptr]"r"(src_u), - [v_ptr]"r"(src_v), [dst_argb4444]"r"(dst_argb4444), - [yuvcons_ptr]"r"(yuvconstants), [width]"r"(width), - [zero]"f"(0x00), [five]"f"(0x55), - [six]"f"(0x6), [mask]"f"(0xff00ff00ff00ff00), - [four]"f"(0x4), [mask1]"f"(0xf0f0f0f0f0f0f0f0), - [alpha]"f"(-1) - : "memory" - ); -} - -void I422ToARGB1555Row_MMI(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* dst_argb1555, - const struct YuvConstants* yuvconstants, - int width) { - uint64_t y, u, v; - uint64_t b_vec, g_vec, r_vec, temp; - uint64_t ub,ug,vg,vr,bb,bg,br,yg; - - __asm__ volatile( - "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t" - "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t" - "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t" - "or %[ub], %[ub], %[mask1] \n\t" - "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t" - "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t" - "punpcklbh %[ug], %[ug], %[zero] \n\t" - "pshufh %[ug], %[ug], 
%[zero] \n\t" - "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t" - "punpcklbh %[vg], %[vg], %[zero] \n\t" - "pshufh %[vg], %[vg], %[five] \n\t" - "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t" - "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t" - "punpcklbh %[vr], %[vr], %[zero] \n\t" - "pshufh %[vr], %[vr], %[five] \n\t" - "or %[vr], %[vr], %[mask1] \n\t" - - "1: \n\t" - "gslwlc1 %[y], 0x03(%[y_ptr]) \n\t" - "gslwrc1 %[y], 0x00(%[y_ptr]) \n\t" - "gslwlc1 %[u], 0x03(%[u_ptr]) \n\t" - "gslwrc1 %[u], 0x00(%[u_ptr]) \n\t" - "gslwlc1 %[v], 0x03(%[v_ptr]) \n\t" - "gslwrc1 %[v], 0x00(%[v_ptr]) \n\t" - - "punpcklbh %[y], %[y], %[y] \n\t" - "pmulhuh %[y], %[y], %[yg] \n\t" - - //u3|u2|u1|u0 --> u1|u1|u0|u0 - "punpcklbh %[u], %[u], %[u] \n\t" - "punpcklbh %[u], %[u], %[zero] \n\t" - "paddsh %[b_vec], %[y], %[bb] \n\t" - "pmullh %[temp], %[u], %[ub] \n\t" - "psubsh %[b_vec], %[b_vec], %[temp] \n\t" - "psrah %[b_vec], %[b_vec], %[six] \n\t" - - //v3|v2|v1|v0 --> v1|v1|v0|v0 - "punpcklbh %[v], %[v], %[v] \n\t" - "punpcklbh %[v], %[v], %[zero] \n\t" - "paddsh %[g_vec], %[y], %[bg] \n\t" - "pmullh %[temp], %[u], %[ug] \n\t" - "psubsh %[g_vec], %[g_vec], %[temp] \n\t" - "pmullh %[temp], %[v], %[vg] \n\t" - "psubsh %[g_vec], %[g_vec], %[temp] \n\t" - "psrah %[g_vec], %[g_vec], %[six] \n\t" - - "paddsh %[r_vec], %[y], %[br] \n\t" - "pmullh %[temp], %[v], %[vr] \n\t" - "psubsh %[r_vec], %[r_vec], %[temp] \n\t" - "psrah %[r_vec], %[r_vec], %[six] \n\t" - - "packushb %[r_vec], %[b_vec], %[r_vec] \n\t" - "packushb %[g_vec], %[g_vec], %[zero] \n\t" - "punpcklbh %[b_vec], %[r_vec], %[g_vec] \n\t" - "punpckhbh %[r_vec], %[r_vec], %[g_vec] \n\t" - "punpcklhw %[g_vec], %[b_vec], %[r_vec] \n\t" - "punpckhhw %[b_vec], %[b_vec], %[r_vec] \n\t" - - "psrlw %[temp], %[g_vec], %[three] \n\t" - "and %[g_vec], %[temp], %[mask2] \n\t" - "psrlw %[temp], %[temp], %[eight] \n\t" - "and %[r_vec], %[temp], %[mask2] \n\t" - "psllw %[r_vec], %[r_vec], %[lmove5] \n\t" - "or %[g_vec], %[g_vec], %[r_vec] \n\t" - "psrlw %[temp], %[temp], %[eight] \n\t" - "and %[r_vec], %[temp], %[mask2] \n\t" - "psllw %[r_vec], %[r_vec], %[lmove5] \n\t" - "psllw %[r_vec], %[r_vec], %[lmove5] \n\t" - "or %[g_vec], %[g_vec], %[r_vec] \n\t" - "or %[g_vec], %[g_vec], %[mask3] \n\t" - - "psrlw %[temp], %[b_vec], %[three] \n\t" - "and %[b_vec], %[temp], %[mask2] \n\t" - "psrlw %[temp], %[temp], %[eight] \n\t" - "and %[r_vec], %[temp], %[mask2] \n\t" - "psllw %[r_vec], %[r_vec], %[lmove5] \n\t" - "or %[b_vec], %[b_vec], %[r_vec] \n\t" - "psrlw %[temp], %[temp], %[eight] \n\t" - "and %[r_vec], %[temp], %[mask2] \n\t" - "psllw %[r_vec], %[r_vec], %[lmove5] \n\t" - "psllw %[r_vec], %[r_vec], %[lmove5] \n\t" - "or %[b_vec], %[b_vec], %[r_vec] \n\t" - "or %[b_vec], %[b_vec], %[mask3] \n\t" - - "punpcklhw %[r_vec], %[g_vec], %[b_vec] \n\t" - "punpckhhw %[b_vec], %[g_vec], %[b_vec] \n\t" - "punpcklhw %[g_vec], %[r_vec], %[b_vec] \n\t" - - "gssdlc1 %[g_vec], 0x07(%[dst_argb1555]) \n\t" - "gssdrc1 %[g_vec], 0x00(%[dst_argb1555]) \n\t" - - "daddiu %[y_ptr], %[y_ptr], 0x04 \n\t" - "daddiu %[u_ptr], %[u_ptr], 0x02 \n\t" - "daddiu %[v_ptr], %[v_ptr], 0x02 \n\t" - "daddiu %[dst_argb1555], %[dst_argb1555], 0x08 \n\t" - "daddi %[width], %[width], -0x04 \n\t" - "bnez %[width], 1b \n\t" - - : [y]"=&f"(y), [u]"=&f"(u), - [v]"=&f"(v), - [b_vec]"=&f"(b_vec), [g_vec]"=&f"(g_vec), - [r_vec]"=&f"(r_vec), [temp]"=&f"(temp), - [ub]"=&f"(ub), [ug]"=&f"(ug), - [vg]"=&f"(vg), [vr]"=&f"(vr), - [bb]"=&f"(bb), [bg]"=&f"(bg), - [br]"=&f"(br), [yg]"=&f"(yg) - : [y_ptr]"r"(src_y), [u_ptr]"r"(src_u), - 
[v_ptr]"r"(src_v), [dst_argb1555]"r"(dst_argb1555), - [yuvcons_ptr]"r"(yuvconstants), [width]"r"(width), - [zero]"f"(0x00), [five]"f"(0x55), - [six]"f"(0x6), [mask1]"f"(0xff00ff00ff00ff00), - [three]"f"(0x3), [mask2]"f"(0x1f0000001f), - [eight]"f"(0x8), [mask3]"f"(0x800000008000), - [lmove5]"f"(0x5) - : "memory" - ); -} - -void I422ToRGB565Row_MMI(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* dst_rgb565, - const struct YuvConstants* yuvconstants, - int width) { - uint64_t y, u, v; - uint64_t b_vec, g_vec, r_vec, temp; - uint64_t ub,ug,vg,vr,bb,bg,br,yg; - - __asm__ volatile( - "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t" - "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t" - "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t" - "or %[ub], %[ub], %[mask1] \n\t" - "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t" - "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t" - "punpcklbh %[ug], %[ug], %[zero] \n\t" - "pshufh %[ug], %[ug], %[zero] \n\t" - "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t" - "punpcklbh %[vg], %[vg], %[zero] \n\t" - "pshufh %[vg], %[vg], %[five] \n\t" - "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t" - "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t" - "punpcklbh %[vr], %[vr], %[zero] \n\t" - "pshufh %[vr], %[vr], %[five] \n\t" - "or %[vr], %[vr], %[mask1] \n\t" - - "1: \n\t" - "gslwlc1 %[y], 0x03(%[y_ptr]) \n\t" - "gslwrc1 %[y], 0x00(%[y_ptr]) \n\t" - "gslwlc1 %[u], 0x03(%[u_ptr]) \n\t" - "gslwrc1 %[u], 0x00(%[u_ptr]) \n\t" - "gslwlc1 %[v], 0x03(%[v_ptr]) \n\t" - "gslwrc1 %[v], 0x00(%[v_ptr]) \n\t" - - "punpcklbh %[y], %[y], %[y] \n\t" - "pmulhuh %[y], %[y], %[yg] \n\t" - - //u3|u2|u1|u0 --> u1|u1|u0|u0 - "punpcklbh %[u], %[u], %[u] \n\t" - "punpcklbh %[u], %[u], %[zero] \n\t" - "paddsh %[b_vec], %[y], %[bb] \n\t" - "pmullh %[temp], %[u], %[ub] \n\t" - "psubsh %[b_vec], %[b_vec], %[temp] \n\t" - "psrah %[b_vec], %[b_vec], %[six] \n\t" - - //v3|v2|v1|v0 --> v1|v1|v0|v0 - "punpcklbh %[v], %[v], %[v] \n\t" - "punpcklbh %[v], %[v], %[zero] \n\t" - "paddsh %[g_vec], %[y], %[bg] \n\t" - "pmullh %[temp], %[u], %[ug] \n\t" - "psubsh %[g_vec], %[g_vec], %[temp] \n\t" - "pmullh %[temp], %[v], %[vg] \n\t" - "psubsh %[g_vec], %[g_vec], %[temp] \n\t" - "psrah %[g_vec], %[g_vec], %[six] \n\t" - - "paddsh %[r_vec], %[y], %[br] \n\t" - "pmullh %[temp], %[v], %[vr] \n\t" - "psubsh %[r_vec], %[r_vec], %[temp] \n\t" - "psrah %[r_vec], %[r_vec], %[six] \n\t" - - "packushb %[r_vec], %[b_vec], %[r_vec] \n\t" - "packushb %[g_vec], %[g_vec], %[zero] \n\t" - "punpcklbh %[b_vec], %[r_vec], %[g_vec] \n\t" - "punpckhbh %[r_vec], %[r_vec], %[g_vec] \n\t" - "punpcklhw %[g_vec], %[b_vec], %[r_vec] \n\t" - "punpckhhw %[b_vec], %[b_vec], %[r_vec] \n\t" - - "psrlh %[temp], %[g_vec], %[three] \n\t" - "and %[g_vec], %[temp], %[mask2] \n\t" - "psrlw %[temp], %[temp], %[seven] \n\t" - "psrlw %[r_vec], %[mask1], %[eight] \n\t" - "and %[r_vec], %[temp], %[r_vec] \n\t" - "psllw %[r_vec], %[r_vec], %[lmove5] \n\t" - "or %[g_vec], %[g_vec], %[r_vec] \n\t" - "paddb %[r_vec], %[three], %[six] \n\t" - "psrlw %[temp], %[temp], %[r_vec] \n\t" - "and %[r_vec], %[temp], %[mask2] \n\t" - "paddb %[temp], %[three], %[eight] \n\t" - "psllw %[r_vec], %[r_vec], %[temp] \n\t" - "or %[g_vec], %[g_vec], %[r_vec] \n\t" - - "psrlh %[temp], %[b_vec], %[three] \n\t" - "and %[b_vec], %[temp], %[mask2] \n\t" - "psrlw %[temp], %[temp], %[seven] \n\t" - "psrlw %[r_vec], %[mask1], %[eight] \n\t" - "and %[r_vec], %[temp], %[r_vec] \n\t" - "psllw %[r_vec], %[r_vec], %[lmove5] \n\t" - "or %[b_vec], %[b_vec], %[r_vec] \n\t" - "paddb %[r_vec], %[three], %[six] \n\t" - "psrlw 
%[temp], %[temp], %[r_vec] \n\t" - "and %[r_vec], %[temp], %[mask2] \n\t" - "paddb %[temp], %[three], %[eight] \n\t" - "psllw %[r_vec], %[r_vec], %[temp] \n\t" - "or %[b_vec], %[b_vec], %[r_vec] \n\t" - - "punpcklhw %[r_vec], %[g_vec], %[b_vec] \n\t" - "punpckhhw %[b_vec], %[g_vec], %[b_vec] \n\t" - "punpcklhw %[g_vec], %[r_vec], %[b_vec] \n\t" - - "gssdlc1 %[g_vec], 0x07(%[dst_rgb565]) \n\t" - "gssdrc1 %[g_vec], 0x00(%[dst_rgb565]) \n\t" - - "daddiu %[y_ptr], %[y_ptr], 0x04 \n\t" - "daddiu %[u_ptr], %[u_ptr], 0x02 \n\t" - "daddiu %[v_ptr], %[v_ptr], 0x02 \n\t" - "daddiu %[dst_rgb565], %[dst_rgb565], 0x08 \n\t" - "daddi %[width], %[width], -0x04 \n\t" - "bnez %[width], 1b \n\t" - - : [y]"=&f"(y), [u]"=&f"(u), - [v]"=&f"(v), - [b_vec]"=&f"(b_vec), [g_vec]"=&f"(g_vec), - [r_vec]"=&f"(r_vec), [temp]"=&f"(temp), - [ub]"=&f"(ub), [ug]"=&f"(ug), - [vg]"=&f"(vg), [vr]"=&f"(vr), - [bb]"=&f"(bb), [bg]"=&f"(bg), - [br]"=&f"(br), [yg]"=&f"(yg) - : [y_ptr]"r"(src_y), [u_ptr]"r"(src_u), - [v_ptr]"r"(src_v), [dst_rgb565]"r"(dst_rgb565), - [yuvcons_ptr]"r"(yuvconstants), [width]"r"(width), - [zero]"f"(0x00), [five]"f"(0x55), - [six]"f"(0x6), [mask1]"f"(0xff00ff00ff00ff00), - [three]"f"(0x3), [mask2]"f"(0x1f0000001f), - [eight]"f"(0x8), [seven]"f"(0x7), - [lmove5]"f"(0x5) - : "memory" - ); -} - -void NV12ToARGBRow_MMI(const uint8_t* src_y, - const uint8_t* src_uv, - uint8_t* rgb_buf, - const struct YuvConstants* yuvconstants, - int width) { - uint64_t y, u, v; - uint64_t b_vec, g_vec, r_vec, temp; - uint64_t ub,ug,vg,vr,bb,bg,br,yg; - - __asm__ volatile( - "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t" - "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t" - "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t" - "or %[ub], %[ub], %[mask1] \n\t" - "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t" - "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t" - "punpcklbh %[ug], %[ug], %[zero] \n\t" - "pshufh %[ug], %[ug], %[zero] \n\t" - "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t" - "punpcklbh %[vg], %[vg], %[zero] \n\t" - "pshufh %[vg], %[vg], %[five] \n\t" - "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t" - "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t" - "punpcklbh %[vr], %[vr], %[zero] \n\t" - "pshufh %[vr], %[vr], %[five] \n\t" - "or %[vr], %[vr], %[mask1] \n\t" - - "1: \n\t" - "gslwlc1 %[y], 0x03(%[y_ptr]) \n\t" - "gslwrc1 %[y], 0x00(%[y_ptr]) \n\t" - "gslwlc1 %[u], 0x03(%[uv_ptr]) \n\t" - "gslwrc1 %[u], 0x00(%[uv_ptr]) \n\t" - "punpcklbh %[u], %[u], %[zero] \n\t" - "pshufh %[v], %[u], %[vshu] \n\t" - "pshufh %[u], %[u], %[ushu] \n\t" - - "punpcklbh %[y], %[y], %[y] \n\t" - "pmulhuh %[y], %[y], %[yg] \n\t" - - "paddsh %[b_vec], %[y], %[bb] \n\t" - "pmullh %[temp], %[u], %[ub] \n\t" - "psubsh %[b_vec], %[b_vec], %[temp] \n\t" - "psrah %[b_vec], %[b_vec], %[six] \n\t" - - "paddsh %[g_vec], %[y], %[bg] \n\t" - "pmullh %[temp], %[u], %[ug] \n\t" - "psubsh %[g_vec], %[g_vec], %[temp] \n\t" - "pmullh %[temp], %[v], %[vg] \n\t" - "psubsh %[g_vec], %[g_vec], %[temp] \n\t" - "psrah %[g_vec], %[g_vec], %[six] \n\t" - - "paddsh %[r_vec], %[y], %[br] \n\t" - "pmullh %[temp], %[v], %[vr] \n\t" - "psubsh %[r_vec], %[r_vec], %[temp] \n\t" - "psrah %[r_vec], %[r_vec], %[six] \n\t" - - "packushb %[r_vec], %[b_vec], %[r_vec] \n\t" - "packushb %[g_vec], %[g_vec], %[zero] \n\t" - "punpcklwd %[g_vec], %[g_vec], %[alpha] \n\t" - "punpcklbh %[b_vec], %[r_vec], %[g_vec] \n\t" - "punpckhbh %[r_vec], %[r_vec], %[g_vec] \n\t" - "punpcklhw %[g_vec], %[b_vec], %[r_vec] \n\t" - "punpckhhw %[b_vec], %[b_vec], %[r_vec] \n\t" - - "gssdlc1 %[g_vec], 0x07(%[rgbbuf_ptr]) \n\t" - "gssdrc1 %[g_vec], 0x00(%[rgbbuf_ptr]) 
\n\t" - "gssdlc1 %[b_vec], 0x0f(%[rgbbuf_ptr]) \n\t" - "gssdrc1 %[b_vec], 0x08(%[rgbbuf_ptr]) \n\t" - - "daddiu %[y_ptr], %[y_ptr], 0x04 \n\t" - "daddiu %[uv_ptr], %[uv_ptr], 0x04 \n\t" - "daddiu %[rgbbuf_ptr], %[rgbbuf_ptr], 0x10 \n\t" - "daddi %[width], %[width], -0x04 \n\t" - "bnez %[width], 1b \n\t" - - : [y]"=&f"(y), [u]"=&f"(u), - [v]"=&f"(v), - [b_vec]"=&f"(b_vec), [g_vec]"=&f"(g_vec), - [r_vec]"=&f"(r_vec), [temp]"=&f"(temp), - [ub]"=&f"(ub), [ug]"=&f"(ug), - [vg]"=&f"(vg), [vr]"=&f"(vr), - [bb]"=&f"(bb), [bg]"=&f"(bg), - [br]"=&f"(br), [yg]"=&f"(yg) - : [y_ptr]"r"(src_y), [uv_ptr]"r"(src_uv), - [rgbbuf_ptr]"r"(rgb_buf), - [yuvcons_ptr]"r"(yuvconstants), [width]"r"(width), - [zero]"f"(0x00), [five]"f"(0x55), - [six]"f"(0x6), [mask1]"f"(0xff00ff00ff00ff00), - [ushu]"f"(0xA0), [vshu]"f"(0xf5), - [alpha]"f"(-1) - : "memory" - ); -} - -void NV21ToARGBRow_MMI(const uint8_t* src_y, - const uint8_t* src_vu, - uint8_t* rgb_buf, - const struct YuvConstants* yuvconstants, - int width) { - uint64_t y, u, v; - uint64_t b_vec, g_vec, r_vec, temp; - uint64_t ub,ug,vg,vr,bb,bg,br,yg; - - __asm__ volatile( - "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t" - "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t" - "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t" - "or %[ub], %[ub], %[mask1] \n\t" - "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t" - "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t" - "punpcklbh %[ug], %[ug], %[zero] \n\t" - "pshufh %[ug], %[ug], %[zero] \n\t" - "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t" - "punpcklbh %[vg], %[vg], %[zero] \n\t" - "pshufh %[vg], %[vg], %[five] \n\t" - "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t" - "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t" - "punpcklbh %[vr], %[vr], %[zero] \n\t" - "pshufh %[vr], %[vr], %[five] \n\t" - "or %[vr], %[vr], %[mask1] \n\t" - - "1: \n\t" - "gslwlc1 %[y], 0x03(%[y_ptr]) \n\t" - "gslwrc1 %[y], 0x00(%[y_ptr]) \n\t" - "gslwlc1 %[u], 0x03(%[vu_ptr]) \n\t" - "gslwrc1 %[u], 0x00(%[vu_ptr]) \n\t" - "punpcklbh %[u], %[u], %[zero] \n\t" - "pshufh %[v], %[u], %[ushu] \n\t" - "pshufh %[u], %[u], %[vshu] \n\t" - - "punpcklbh %[y], %[y], %[y] \n\t" - "pmulhuh %[y], %[y], %[yg] \n\t" - - "paddsh %[b_vec], %[y], %[bb] \n\t" - "pmullh %[temp], %[u], %[ub] \n\t" - "psubsh %[b_vec], %[b_vec], %[temp] \n\t" - "psrah %[b_vec], %[b_vec], %[six] \n\t" - - "paddsh %[g_vec], %[y], %[bg] \n\t" - "pmullh %[temp], %[u], %[ug] \n\t" - "psubsh %[g_vec], %[g_vec], %[temp] \n\t" - "pmullh %[temp], %[v], %[vg] \n\t" - "psubsh %[g_vec], %[g_vec], %[temp] \n\t" - "psrah %[g_vec], %[g_vec], %[six] \n\t" - - "paddsh %[r_vec], %[y], %[br] \n\t" - "pmullh %[temp], %[v], %[vr] \n\t" - "psubsh %[r_vec], %[r_vec], %[temp] \n\t" - "psrah %[r_vec], %[r_vec], %[six] \n\t" - - "packushb %[r_vec], %[b_vec], %[r_vec] \n\t" - "packushb %[g_vec], %[g_vec], %[zero] \n\t" - "punpcklwd %[g_vec], %[g_vec], %[alpha] \n\t" - "punpcklbh %[b_vec], %[r_vec], %[g_vec] \n\t" - "punpckhbh %[r_vec], %[r_vec], %[g_vec] \n\t" - "punpcklhw %[g_vec], %[b_vec], %[r_vec] \n\t" - "punpckhhw %[b_vec], %[b_vec], %[r_vec] \n\t" - - "gssdlc1 %[g_vec], 0x07(%[rgbbuf_ptr]) \n\t" - "gssdrc1 %[g_vec], 0x00(%[rgbbuf_ptr]) \n\t" - "gssdlc1 %[b_vec], 0x0f(%[rgbbuf_ptr]) \n\t" - "gssdrc1 %[b_vec], 0x08(%[rgbbuf_ptr]) \n\t" - - "daddiu %[y_ptr], %[y_ptr], 0x04 \n\t" - "daddiu %[vu_ptr], %[vu_ptr], 0x04 \n\t" - "daddiu %[rgbbuf_ptr], %[rgbbuf_ptr], 0x10 \n\t" - "daddi %[width], %[width], -0x04 \n\t" - "bnez %[width], 1b \n\t" - - : [y]"=&f"(y), [u]"=&f"(u), - [v]"=&f"(v), - [b_vec]"=&f"(b_vec), [g_vec]"=&f"(g_vec), - [r_vec]"=&f"(r_vec), [temp]"=&f"(temp), - 
[ub]"=&f"(ub), [ug]"=&f"(ug), - [vg]"=&f"(vg), [vr]"=&f"(vr), - [bb]"=&f"(bb), [bg]"=&f"(bg), - [br]"=&f"(br), [yg]"=&f"(yg) - : [y_ptr]"r"(src_y), [vu_ptr]"r"(src_vu), - [rgbbuf_ptr]"r"(rgb_buf), - [yuvcons_ptr]"r"(yuvconstants), [width]"r"(width), - [zero]"f"(0x00), [five]"f"(0x55), - [six]"f"(0x6), [mask1]"f"(0xff00ff00ff00ff00), - [ushu]"f"(0xA0), [vshu]"f"(0xf5), - [alpha]"f"(-1) - : "memory" - ); -} - -void NV12ToRGB24Row_MMI(const uint8_t* src_y, - const uint8_t* src_uv, - uint8_t* rgb_buf, - const struct YuvConstants* yuvconstants, - int width) { - uint64_t y, u, v; - uint64_t b_vec, g_vec, r_vec, temp; - uint64_t ub,ug,vg,vr,bb,bg,br,yg; - - __asm__ volatile( - "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t" - "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t" - "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t" - "or %[ub], %[ub], %[mask1] \n\t" - "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t" - "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t" - "punpcklbh %[ug], %[ug], %[zero] \n\t" - "pshufh %[ug], %[ug], %[zero] \n\t" - "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t" - "punpcklbh %[vg], %[vg], %[zero] \n\t" - "pshufh %[vg], %[vg], %[five] \n\t" - "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t" - "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t" - "punpcklbh %[vr], %[vr], %[zero] \n\t" - "pshufh %[vr], %[vr], %[five] \n\t" - "or %[vr], %[vr], %[mask1] \n\t" - - "1: \n\t" - "gslwlc1 %[y], 0x03(%[y_ptr]) \n\t" - "gslwrc1 %[y], 0x00(%[y_ptr]) \n\t" - "gslwlc1 %[u], 0x03(%[uv_ptr]) \n\t" - "gslwrc1 %[u], 0x00(%[uv_ptr]) \n\t" - "punpcklbh %[u], %[u], %[zero] \n\t" - "pshufh %[v], %[u], %[vshu] \n\t" - "pshufh %[u], %[u], %[ushu] \n\t" - - "punpcklbh %[y], %[y], %[y] \n\t" - "pmulhuh %[y], %[y], %[yg] \n\t" - - "paddsh %[b_vec], %[y], %[bb] \n\t" - "pmullh %[temp], %[u], %[ub] \n\t" - "psubsh %[b_vec], %[b_vec], %[temp] \n\t" - "psrah %[b_vec], %[b_vec], %[six] \n\t" - - "paddsh %[g_vec], %[y], %[bg] \n\t" - "pmullh %[temp], %[u], %[ug] \n\t" - "psubsh %[g_vec], %[g_vec], %[temp] \n\t" - "pmullh %[temp], %[v], %[vg] \n\t" - "psubsh %[g_vec], %[g_vec], %[temp] \n\t" - "psrah %[g_vec], %[g_vec], %[six] \n\t" - - "paddsh %[r_vec], %[y], %[br] \n\t" - "pmullh %[temp], %[v], %[vr] \n\t" - "psubsh %[r_vec], %[r_vec], %[temp] \n\t" - "psrah %[r_vec], %[r_vec], %[six] \n\t" - - "packushb %[r_vec], %[b_vec], %[r_vec] \n\t" - "packushb %[g_vec], %[g_vec], %[zero] \n\t" - "punpcklbh %[b_vec], %[r_vec], %[g_vec] \n\t" - "punpckhbh %[r_vec], %[r_vec], %[g_vec] \n\t" - "punpcklhw %[g_vec], %[b_vec], %[r_vec] \n\t" - "punpckhhw %[b_vec], %[b_vec], %[r_vec] \n\t" - - "punpckhwd %[r_vec], %[g_vec], %[g_vec] \n\t" - "psllw %[temp], %[r_vec], %[lmove1] \n\t" - "or %[g_vec], %[g_vec], %[temp] \n\t" - "psrlw %[temp], %[r_vec], %[rmove1] \n\t" - "pextrh %[temp], %[temp], %[zero] \n\t" - "pinsrh_2 %[g_vec], %[g_vec], %[temp] \n\t" - "pextrh %[temp], %[b_vec], %[zero] \n\t" - "pinsrh_3 %[g_vec], %[g_vec], %[temp] \n\t" - "pextrh %[temp], %[b_vec], %[one] \n\t" - "punpckhwd %[b_vec], %[b_vec], %[b_vec] \n\t" - "psllw %[b_vec], %[b_vec], %[rmove1] \n\t" - "or %[b_vec], %[b_vec], %[temp] \n\t" - "gssdlc1 %[g_vec], 0x07(%[rgbbuf_ptr]) \n\t" - "gssdrc1 %[g_vec], 0x00(%[rgbbuf_ptr]) \n\t" - "gsswlc1 %[b_vec], 0x0b(%[rgbbuf_ptr]) \n\t" - "gsswrc1 %[b_vec], 0x08(%[rgbbuf_ptr]) \n\t" - - "daddiu %[y_ptr], %[y_ptr], 0x04 \n\t" - "daddiu %[uv_ptr], %[uv_ptr], 0x04 \n\t" - "daddiu %[rgbbuf_ptr], %[rgbbuf_ptr], 0x0C \n\t" - "daddi %[width], %[width], -0x04 \n\t" - "bnez %[width], 1b \n\t" - - : [y]"=&f"(y), [u]"=&f"(u), - [v]"=&f"(v), - [b_vec]"=&f"(b_vec), [g_vec]"=&f"(g_vec), - 
[r_vec]"=&f"(r_vec), [temp]"=&f"(temp), - [ub]"=&f"(ub), [ug]"=&f"(ug), - [vg]"=&f"(vg), [vr]"=&f"(vr), - [bb]"=&f"(bb), [bg]"=&f"(bg), - [br]"=&f"(br), [yg]"=&f"(yg) - : [y_ptr]"r"(src_y), [uv_ptr]"r"(src_uv), - [rgbbuf_ptr]"r"(rgb_buf), - [yuvcons_ptr]"r"(yuvconstants), [width]"r"(width), - [zero]"f"(0x00), [five]"f"(0x55), - [six]"f"(0x6), [mask1]"f"(0xff00ff00ff00ff00), - [ushu]"f"(0xA0), [vshu]"f"(0xf5), - [alpha]"f"(-1), [lmove1]"f"(0x18), - [one]"f"(0x1), [rmove1]"f"(0x8) - : "memory" - ); -} - -void NV21ToRGB24Row_MMI(const uint8_t* src_y, - const uint8_t* src_vu, - uint8_t* rgb_buf, - const struct YuvConstants* yuvconstants, - int width) { - uint64_t y, u, v; - uint64_t b_vec, g_vec, r_vec, temp; - uint64_t ub,ug,vg,vr,bb,bg,br,yg; - - __asm__ volatile( - "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t" - "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t" - "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t" - "or %[ub], %[ub], %[mask1] \n\t" - "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t" - "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t" - "punpcklbh %[ug], %[ug], %[zero] \n\t" - "pshufh %[ug], %[ug], %[zero] \n\t" - "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t" - "punpcklbh %[vg], %[vg], %[zero] \n\t" - "pshufh %[vg], %[vg], %[five] \n\t" - "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t" - "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t" - "punpcklbh %[vr], %[vr], %[zero] \n\t" - "pshufh %[vr], %[vr], %[five] \n\t" - "or %[vr], %[vr], %[mask1] \n\t" - - "1: \n\t" - "gslwlc1 %[y], 0x03(%[y_ptr]) \n\t" - "gslwrc1 %[y], 0x00(%[y_ptr]) \n\t" - "gslwlc1 %[u], 0x03(%[vu_ptr]) \n\t" - "gslwrc1 %[u], 0x00(%[vu_ptr]) \n\t" - "punpcklbh %[u], %[u], %[zero] \n\t" - "pshufh %[v], %[u], %[ushu] \n\t" - "pshufh %[u], %[u], %[vshu] \n\t" - - "punpcklbh %[y], %[y], %[y] \n\t" - "pmulhuh %[y], %[y], %[yg] \n\t" - - "paddsh %[b_vec], %[y], %[bb] \n\t" - "pmullh %[temp], %[u], %[ub] \n\t" - "psubsh %[b_vec], %[b_vec], %[temp] \n\t" - "psrah %[b_vec], %[b_vec], %[six] \n\t" - - "paddsh %[g_vec], %[y], %[bg] \n\t" - "pmullh %[temp], %[u], %[ug] \n\t" - "psubsh %[g_vec], %[g_vec], %[temp] \n\t" - "pmullh %[temp], %[v], %[vg] \n\t" - "psubsh %[g_vec], %[g_vec], %[temp] \n\t" - "psrah %[g_vec], %[g_vec], %[six] \n\t" - - "paddsh %[r_vec], %[y], %[br] \n\t" - "pmullh %[temp], %[v], %[vr] \n\t" - "psubsh %[r_vec], %[r_vec], %[temp] \n\t" - "psrah %[r_vec], %[r_vec], %[six] \n\t" - - "packushb %[r_vec], %[b_vec], %[r_vec] \n\t" - "packushb %[g_vec], %[g_vec], %[zero] \n\t" - "punpcklbh %[b_vec], %[r_vec], %[g_vec] \n\t" - "punpckhbh %[r_vec], %[r_vec], %[g_vec] \n\t" - "punpcklhw %[g_vec], %[b_vec], %[r_vec] \n\t" - "punpckhhw %[b_vec], %[b_vec], %[r_vec] \n\t" - - "punpckhwd %[r_vec], %[g_vec], %[g_vec] \n\t" - "psllw %[temp], %[r_vec], %[lmove1] \n\t" - "or %[g_vec], %[g_vec], %[temp] \n\t" - "psrlw %[temp], %[r_vec], %[rmove1] \n\t" - "pextrh %[temp], %[temp], %[zero] \n\t" - "pinsrh_2 %[g_vec], %[g_vec], %[temp] \n\t" - "pextrh %[temp], %[b_vec], %[zero] \n\t" - "pinsrh_3 %[g_vec], %[g_vec], %[temp] \n\t" - "pextrh %[temp], %[b_vec], %[one] \n\t" - "punpckhwd %[b_vec], %[b_vec], %[b_vec] \n\t" - "psllw %[b_vec], %[b_vec], %[rmove1] \n\t" - "or %[b_vec], %[b_vec], %[temp] \n\t" - "gssdlc1 %[g_vec], 0x07(%[rgbbuf_ptr]) \n\t" - "gssdrc1 %[g_vec], 0x00(%[rgbbuf_ptr]) \n\t" - "gsswlc1 %[b_vec], 0x0b(%[rgbbuf_ptr]) \n\t" - "gsswrc1 %[b_vec], 0x08(%[rgbbuf_ptr]) \n\t" - - "daddiu %[y_ptr], %[y_ptr], 0x04 \n\t" - "daddiu %[vu_ptr], %[vu_ptr], 0x04 \n\t" - "daddiu %[rgbbuf_ptr], %[rgbbuf_ptr], 0x0C \n\t" - "daddi %[width], %[width], -0x04 \n\t" - "bnez %[width], 1b \n\t" - - : 
[y]"=&f"(y), [u]"=&f"(u), - [v]"=&f"(v), - [b_vec]"=&f"(b_vec), [g_vec]"=&f"(g_vec), - [r_vec]"=&f"(r_vec), [temp]"=&f"(temp), - [ub]"=&f"(ub), [ug]"=&f"(ug), - [vg]"=&f"(vg), [vr]"=&f"(vr), - [bb]"=&f"(bb), [bg]"=&f"(bg), - [br]"=&f"(br), [yg]"=&f"(yg) - : [y_ptr]"r"(src_y), [vu_ptr]"r"(src_vu), - [rgbbuf_ptr]"r"(rgb_buf), - [yuvcons_ptr]"r"(yuvconstants), [width]"r"(width), - [zero]"f"(0x00), [five]"f"(0x55), - [six]"f"(0x6), [mask1]"f"(0xff00ff00ff00ff00), - [ushu]"f"(0xA0), [vshu]"f"(0xf5), - [lmove1]"f"(0x18), [rmove1]"f"(0x8), - [one]"f"(0x1) - : "memory" - ); -} - -void NV12ToRGB565Row_MMI(const uint8_t* src_y, - const uint8_t* src_uv, - uint8_t* dst_rgb565, - const struct YuvConstants* yuvconstants, - int width) { - uint64_t y, u, v; - uint64_t b_vec, g_vec, r_vec, temp; - uint64_t ub,ug,vg,vr,bb,bg,br,yg; - - __asm__ volatile( - "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t" - "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t" - "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t" - "or %[ub], %[ub], %[mask1] \n\t" - "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t" - "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t" - "punpcklbh %[ug], %[ug], %[zero] \n\t" - "pshufh %[ug], %[ug], %[zero] \n\t" - "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t" - "punpcklbh %[vg], %[vg], %[zero] \n\t" - "pshufh %[vg], %[vg], %[five] \n\t" - "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t" - "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t" - "punpcklbh %[vr], %[vr], %[zero] \n\t" - "pshufh %[vr], %[vr], %[five] \n\t" - "or %[vr], %[vr], %[mask1] \n\t" - - "1: \n\t" - "gslwlc1 %[y], 0x03(%[y_ptr]) \n\t" - "gslwrc1 %[y], 0x00(%[y_ptr]) \n\t" - "gslwlc1 %[u], 0x03(%[uv_ptr]) \n\t" - "gslwrc1 %[u], 0x00(%[uv_ptr]) \n\t" - "punpcklbh %[u], %[u], %[zero] \n\t" - "pshufh %[v], %[u], %[vshu] \n\t" - "pshufh %[u], %[u], %[ushu] \n\t" - - "punpcklbh %[y], %[y], %[y] \n\t" - "pmulhuh %[y], %[y], %[yg] \n\t" - - "paddsh %[b_vec], %[y], %[bb] \n\t" - "pmullh %[temp], %[u], %[ub] \n\t" - "psubsh %[b_vec], %[b_vec], %[temp] \n\t" - "psrah %[b_vec], %[b_vec], %[six] \n\t" - - "paddsh %[g_vec], %[y], %[bg] \n\t" - "pmullh %[temp], %[u], %[ug] \n\t" - "psubsh %[g_vec], %[g_vec], %[temp] \n\t" - "pmullh %[temp], %[v], %[vg] \n\t" - "psubsh %[g_vec], %[g_vec], %[temp] \n\t" - "psrah %[g_vec], %[g_vec], %[six] \n\t" - - "paddsh %[r_vec], %[y], %[br] \n\t" - "pmullh %[temp], %[v], %[vr] \n\t" - "psubsh %[r_vec], %[r_vec], %[temp] \n\t" - "psrah %[r_vec], %[r_vec], %[six] \n\t" - - "packushb %[r_vec], %[b_vec], %[r_vec] \n\t" - "packushb %[g_vec], %[g_vec], %[zero] \n\t" - "punpcklbh %[b_vec], %[r_vec], %[g_vec] \n\t" - "punpckhbh %[r_vec], %[r_vec], %[g_vec] \n\t" - "punpcklhw %[g_vec], %[b_vec], %[r_vec] \n\t" - "punpckhhw %[b_vec], %[b_vec], %[r_vec] \n\t" - - "psrlh %[temp], %[g_vec], %[three] \n\t" - "and %[g_vec], %[temp], %[mask2] \n\t" - "psrlw %[temp], %[temp], %[seven] \n\t" - "psrlw %[r_vec], %[mask1], %[eight] \n\t" - "and %[r_vec], %[temp], %[r_vec] \n\t" - "psubb %[y], %[eight], %[three] \n\t"//5 - "psllw %[r_vec], %[r_vec], %[y] \n\t" - "or %[g_vec], %[g_vec], %[r_vec] \n\t" - "paddb %[r_vec], %[three], %[six] \n\t" - "psrlw %[temp], %[temp], %[r_vec] \n\t" - "and %[r_vec], %[temp], %[mask2] \n\t" - "paddb %[temp], %[three], %[eight] \n\t" - "psllw %[r_vec], %[r_vec], %[temp] \n\t" - "or %[g_vec], %[g_vec], %[r_vec] \n\t" - - "psrlh %[temp], %[b_vec], %[three] \n\t" - "and %[b_vec], %[temp], %[mask2] \n\t" - "psrlw %[temp], %[temp], %[seven] \n\t" - "psrlw %[r_vec], %[mask1], %[eight] \n\t" - "and %[r_vec], %[temp], %[r_vec] \n\t" - "psubb %[y], %[eight], %[three] \n\t"//5 - "psllw 
%[r_vec], %[r_vec], %[y] \n\t" - "or %[b_vec], %[b_vec], %[r_vec] \n\t" - "paddb %[r_vec], %[three], %[six] \n\t" - "psrlw %[temp], %[temp], %[r_vec] \n\t" - "and %[r_vec], %[temp], %[mask2] \n\t" - "paddb %[temp], %[three], %[eight] \n\t" - "psllw %[r_vec], %[r_vec], %[temp] \n\t" - "or %[b_vec], %[b_vec], %[r_vec] \n\t" - - "punpcklhw %[r_vec], %[g_vec], %[b_vec] \n\t" - "punpckhhw %[b_vec], %[g_vec], %[b_vec] \n\t" - "punpcklhw %[g_vec], %[r_vec], %[b_vec] \n\t" - - "gssdlc1 %[g_vec], 0x07(%[dst_rgb565]) \n\t" - "gssdrc1 %[g_vec], 0x00(%[dst_rgb565]) \n\t" - - "daddiu %[y_ptr], %[y_ptr], 0x04 \n\t" - "daddiu %[uv_ptr], %[uv_ptr], 0x04 \n\t" - "daddiu %[dst_rgb565], %[dst_rgb565], 0x08 \n\t" - "daddi %[width], %[width], -0x04 \n\t" - "bnez %[width], 1b \n\t" - - : [y]"=&f"(y), [u]"=&f"(u), - [v]"=&f"(v), - [b_vec]"=&f"(b_vec), [g_vec]"=&f"(g_vec), - [r_vec]"=&f"(r_vec), [temp]"=&f"(temp), - [ub]"=&f"(ub), [ug]"=&f"(ug), - [vg]"=&f"(vg), [vr]"=&f"(vr), - [bb]"=&f"(bb), [bg]"=&f"(bg), - [br]"=&f"(br), [yg]"=&f"(yg) - : [y_ptr]"r"(src_y), [uv_ptr]"r"(src_uv), - [dst_rgb565]"r"(dst_rgb565), - [yuvcons_ptr]"r"(yuvconstants), [width]"r"(width), - [zero]"f"(0x00), [five]"f"(0x55), - [six]"f"(0x6), [mask1]"f"(0xff00ff00ff00ff00), - [ushu]"f"(0xA0), [vshu]"f"(0xf5), - [three]"f"(0x3), [mask2]"f"(0x1f0000001f), - [eight]"f"(0x8), [seven]"f"(0x7) - : "memory" - ); -} - -void YUY2ToARGBRow_MMI(const uint8_t* src_yuy2, - uint8_t* rgb_buf, - const struct YuvConstants* yuvconstants, - int width) { - uint64_t y, u, v; - uint64_t b_vec, g_vec, r_vec, temp; - uint64_t ub,ug,vg,vr,bb,bg,br,yg; - - __asm__ volatile( - "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t" - "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t" - "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t" - "or %[ub], %[ub], %[mask1] \n\t" - "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t" - "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t" - "punpcklbh %[ug], %[ug], %[zero] \n\t" - "pshufh %[ug], %[ug], %[zero] \n\t" - "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t" - "punpcklbh %[vg], %[vg], %[zero] \n\t" - "pshufh %[vg], %[vg], %[five] \n\t" - "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t" - "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t" - "punpcklbh %[vr], %[vr], %[zero] \n\t" - "pshufh %[vr], %[vr], %[five] \n\t" - "or %[vr], %[vr], %[mask1] \n\t" - - "1: \n\t" - "gsldlc1 %[y], 0x07(%[yuy2_ptr]) \n\t" - "gsldrc1 %[y], 0x00(%[yuy2_ptr]) \n\t" - "psrlh %[temp], %[y], %[eight] \n\t" - "pshufh %[u], %[temp], %[ushu] \n\t" - "pshufh %[v], %[temp], %[vshu] \n\t" - - "psrlh %[temp], %[mask1], %[eight] \n\t" - "and %[y], %[y], %[temp] \n\t" - "psllh %[temp], %[y], %[eight] \n\t" - "or %[y], %[y], %[temp] \n\t" - "pmulhuh %[y], %[y], %[yg] \n\t" - - "paddsh %[b_vec], %[y], %[bb] \n\t" - "pmullh %[temp], %[u], %[ub] \n\t" - "psubsh %[b_vec], %[b_vec], %[temp] \n\t" - "psrah %[b_vec], %[b_vec], %[six] \n\t" - - "paddsh %[g_vec], %[y], %[bg] \n\t" - "pmullh %[temp], %[u], %[ug] \n\t" - "psubsh %[g_vec], %[g_vec], %[temp] \n\t" - "pmullh %[temp], %[v], %[vg] \n\t" - "psubsh %[g_vec], %[g_vec], %[temp] \n\t" - "psrah %[g_vec], %[g_vec], %[six] \n\t" - - "paddsh %[r_vec], %[y], %[br] \n\t" - "pmullh %[temp], %[v], %[vr] \n\t" - "psubsh %[r_vec], %[r_vec], %[temp] \n\t" - "psrah %[r_vec], %[r_vec], %[six] \n\t" - - "packushb %[r_vec], %[b_vec], %[r_vec] \n\t" - "packushb %[g_vec], %[g_vec], %[zero] \n\t" - "punpcklwd %[g_vec], %[g_vec], %[alpha] \n\t" - "punpcklbh %[b_vec], %[r_vec], %[g_vec] \n\t" - "punpckhbh %[r_vec], %[r_vec], %[g_vec] \n\t" - "punpcklhw %[g_vec], %[b_vec], %[r_vec] \n\t" - "punpckhhw %[b_vec], 
%[b_vec], %[r_vec] \n\t" - - "gssdlc1 %[g_vec], 0x07(%[rgbbuf_ptr]) \n\t" - "gssdrc1 %[g_vec], 0x00(%[rgbbuf_ptr]) \n\t" - "gssdlc1 %[b_vec], 0x0f(%[rgbbuf_ptr]) \n\t" - "gssdrc1 %[b_vec], 0x08(%[rgbbuf_ptr]) \n\t" - - "daddiu %[yuy2_ptr], %[yuy2_ptr], 0x08 \n\t" - "daddiu %[rgbbuf_ptr], %[rgbbuf_ptr], 0x10 \n\t" - "daddi %[width], %[width], -0x04 \n\t" - "bnez %[width], 1b \n\t" - - : [y]"=&f"(y), [u]"=&f"(u), - [v]"=&f"(v), - [b_vec]"=&f"(b_vec), [g_vec]"=&f"(g_vec), - [r_vec]"=&f"(r_vec), [temp]"=&f"(temp), - [ub]"=&f"(ub), [ug]"=&f"(ug), - [vg]"=&f"(vg), [vr]"=&f"(vr), - [bb]"=&f"(bb), [bg]"=&f"(bg), - [br]"=&f"(br), [yg]"=&f"(yg) - : [yuy2_ptr]"r"(src_yuy2), [rgbbuf_ptr]"r"(rgb_buf), - [yuvcons_ptr]"r"(yuvconstants), [width]"r"(width), - [zero]"f"(0x00), [five]"f"(0x55), - [six]"f"(0x6), [mask1]"f"(0xff00ff00ff00ff00), - [ushu]"f"(0xA0), [vshu]"f"(0xf5), - [alpha]"f"(-1), [eight]"f"(0x8) - : "memory" - ); -} - -void UYVYToARGBRow_MMI(const uint8_t* src_uyvy, - uint8_t* rgb_buf, - const struct YuvConstants* yuvconstants, - int width) { - uint64_t y, u, v; - uint64_t b_vec, g_vec, r_vec, temp; - uint64_t ub,ug,vg,vr,bb,bg,br,yg; - - __asm__ volatile( - "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t" - "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t" - "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t" - "or %[ub], %[ub], %[mask1] \n\t" - "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t" - "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t" - "punpcklbh %[ug], %[ug], %[zero] \n\t" - "pshufh %[ug], %[ug], %[zero] \n\t" - "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t" - "punpcklbh %[vg], %[vg], %[zero] \n\t" - "pshufh %[vg], %[vg], %[five] \n\t" - "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t" - "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t" - "punpcklbh %[vr], %[vr], %[zero] \n\t" - "pshufh %[vr], %[vr], %[five] \n\t" - "or %[vr], %[vr], %[mask1] \n\t" - - "1: \n\t" - "gsldlc1 %[y], 0x07(%[uyvy_ptr]) \n\t" - "gsldrc1 %[y], 0x00(%[uyvy_ptr]) \n\t" - "psrlh %[temp], %[mask1], %[eight] \n\t" - "and %[temp], %[y], %[temp] \n\t" - "pshufh %[u], %[temp], %[ushu] \n\t" - "pshufh %[v], %[temp], %[vshu] \n\t" - - "psrlh %[y], %[y], %[eight] \n\t" - "psllh %[temp], %[y], %[eight] \n\t" - "or %[y], %[y], %[temp] \n\t" - "pmulhuh %[y], %[y], %[yg] \n\t" - - "paddsh %[b_vec], %[y], %[bb] \n\t" - "pmullh %[temp], %[u], %[ub] \n\t" - "psubsh %[b_vec], %[b_vec], %[temp] \n\t" - "psrah %[b_vec], %[b_vec], %[six] \n\t" - - "paddsh %[g_vec], %[y], %[bg] \n\t" - "pmullh %[temp], %[u], %[ug] \n\t" - "psubsh %[g_vec], %[g_vec], %[temp] \n\t" - "pmullh %[temp], %[v], %[vg] \n\t" - "psubsh %[g_vec], %[g_vec], %[temp] \n\t" - "psrah %[g_vec], %[g_vec], %[six] \n\t" - - "paddsh %[r_vec], %[y], %[br] \n\t" - "pmullh %[temp], %[v], %[vr] \n\t" - "psubsh %[r_vec], %[r_vec], %[temp] \n\t" - "psrah %[r_vec], %[r_vec], %[six] \n\t" - - "packushb %[r_vec], %[b_vec], %[r_vec] \n\t" - "packushb %[g_vec], %[g_vec], %[zero] \n\t" - "punpcklwd %[g_vec], %[g_vec], %[alpha] \n\t" - "punpcklbh %[b_vec], %[r_vec], %[g_vec] \n\t" - "punpckhbh %[r_vec], %[r_vec], %[g_vec] \n\t" - "punpcklhw %[g_vec], %[b_vec], %[r_vec] \n\t" - "punpckhhw %[b_vec], %[b_vec], %[r_vec] \n\t" - - "gssdlc1 %[g_vec], 0x07(%[rgbbuf_ptr]) \n\t" - "gssdrc1 %[g_vec], 0x00(%[rgbbuf_ptr]) \n\t" - "gssdlc1 %[b_vec], 0x0f(%[rgbbuf_ptr]) \n\t" - "gssdrc1 %[b_vec], 0x08(%[rgbbuf_ptr]) \n\t" - - "daddiu %[uyvy_ptr], %[uyvy_ptr], 0x08 \n\t" - "daddiu %[rgbbuf_ptr], %[rgbbuf_ptr], 0x10 \n\t" - "daddi %[width], %[width], -0x04 \n\t" - "bnez %[width], 1b \n\t" - - : [y]"=&f"(y), [u]"=&f"(u), - [v]"=&f"(v), - [b_vec]"=&f"(b_vec), 
[g_vec]"=&f"(g_vec), - [r_vec]"=&f"(r_vec), [temp]"=&f"(temp), - [ub]"=&f"(ub), [ug]"=&f"(ug), - [vg]"=&f"(vg), [vr]"=&f"(vr), - [bb]"=&f"(bb), [bg]"=&f"(bg), - [br]"=&f"(br), [yg]"=&f"(yg) - : [uyvy_ptr]"r"(src_uyvy), [rgbbuf_ptr]"r"(rgb_buf), - [yuvcons_ptr]"r"(yuvconstants), [width]"r"(width), - [zero]"f"(0x00), [five]"f"(0x55), - [six]"f"(0x6), [mask1]"f"(0xff00ff00ff00ff00), - [ushu]"f"(0xA0), [vshu]"f"(0xf5), - [alpha]"f"(-1), [eight]"f"(0x8) - : "memory" - ); -} - -void I422ToRGBARow_MMI(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* rgb_buf, - const struct YuvConstants* yuvconstants, - int width) { - uint64_t y, u, v; - uint64_t b_vec, g_vec, r_vec, temp; - uint64_t ub,ug,vg,vr,bb,bg,br,yg; - - __asm__ volatile( - "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t" - "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t" - "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t" - "or %[ub], %[ub], %[mask1] \n\t" - "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t" - "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t" - "punpcklbh %[ug], %[ug], %[zero] \n\t" - "pshufh %[ug], %[ug], %[zero] \n\t" - "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t" - "punpcklbh %[vg], %[vg], %[zero] \n\t" - "pshufh %[vg], %[vg], %[five] \n\t" - "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t" - "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t" - "punpcklbh %[vr], %[vr], %[zero] \n\t" - "pshufh %[vr], %[vr], %[five] \n\t" - "or %[vr], %[vr], %[mask1] \n\t" - - "1: \n\t" - "gslwlc1 %[y], 0x03(%[y_ptr]) \n\t" - "gslwrc1 %[y], 0x00(%[y_ptr]) \n\t" - "gslwlc1 %[u], 0x03(%[u_ptr]) \n\t" - "gslwrc1 %[u], 0x00(%[u_ptr]) \n\t" - "gslwlc1 %[v], 0x03(%[v_ptr]) \n\t" - "gslwrc1 %[v], 0x00(%[v_ptr]) \n\t" - - "punpcklbh %[y], %[y], %[y] \n\t" - "pmulhuh %[y], %[y], %[yg] \n\t" - - "punpcklbh %[u], %[u], %[u] \n\t" - "punpcklbh %[u], %[u], %[zero] \n\t" - "paddsh %[b_vec], %[y], %[bb] \n\t" - "pmullh %[temp], %[u], %[ub] \n\t" - "psubsh %[b_vec], %[b_vec], %[temp] \n\t" - "psrah %[b_vec], %[b_vec], %[six] \n\t" - - "punpcklbh %[v], %[v], %[v] \n\t" - "punpcklbh %[v], %[v], %[zero] \n\t" - "paddsh %[g_vec], %[y], %[bg] \n\t" - "pmullh %[temp], %[u], %[ug] \n\t" - "psubsh %[g_vec], %[g_vec], %[temp] \n\t" - "pmullh %[temp], %[v], %[vg] \n\t" - "psubsh %[g_vec], %[g_vec], %[temp] \n\t" - "psrah %[g_vec], %[g_vec], %[six] \n\t" - - "paddsh %[r_vec], %[y], %[br] \n\t" - "pmullh %[temp], %[v], %[vr] \n\t" - "psubsh %[r_vec], %[r_vec], %[temp] \n\t" - "psrah %[r_vec], %[r_vec], %[six] \n\t" - - "packushb %[r_vec], %[b_vec], %[r_vec] \n\t" - "packushb %[g_vec], %[g_vec], %[zero] \n\t" - "punpcklwd %[g_vec], %[alpha], %[g_vec] \n\t" - "punpcklbh %[b_vec], %[g_vec], %[r_vec] \n\t" - "punpckhbh %[r_vec], %[g_vec], %[r_vec] \n\t" - "punpcklhw %[g_vec], %[b_vec], %[r_vec] \n\t" - "punpckhhw %[b_vec], %[b_vec], %[r_vec] \n\t" - - "gssdlc1 %[g_vec], 0x07(%[rgbbuf_ptr]) \n\t" - "gssdrc1 %[g_vec], 0x00(%[rgbbuf_ptr]) \n\t" - "gssdlc1 %[b_vec], 0x0f(%[rgbbuf_ptr]) \n\t" - "gssdrc1 %[b_vec], 0x08(%[rgbbuf_ptr]) \n\t" - - "daddiu %[y_ptr], %[y_ptr], 0x04 \n\t" - "daddiu %[u_ptr], %[u_ptr], 0x02 \n\t" - "daddiu %[v_ptr], %[v_ptr], 0x02 \n\t" - "daddiu %[rgbbuf_ptr], %[rgbbuf_ptr], 0x10 \n\t" - "daddi %[width], %[width], -0x04 \n\t" - "bnez %[width], 1b \n\t" - - : [y]"=&f"(y), [u]"=&f"(u), - [v]"=&f"(v), - [b_vec]"=&f"(b_vec), [g_vec]"=&f"(g_vec), - [r_vec]"=&f"(r_vec), [temp]"=&f"(temp), - [ub]"=&f"(ub), [ug]"=&f"(ug), - [vg]"=&f"(vg), [vr]"=&f"(vr), - [bb]"=&f"(bb), [bg]"=&f"(bg), - [br]"=&f"(br), [yg]"=&f"(yg) - : [y_ptr]"r"(src_y), [u_ptr]"r"(src_u), - [v_ptr]"r"(src_v), 
[rgbbuf_ptr]"r"(rgb_buf), - [yuvcons_ptr]"r"(yuvconstants), [width]"r"(width), - [zero]"f"(0x00), [five]"f"(0x55), - [six]"f"(0x6), [mask1]"f"(0xff00ff00ff00ff00), - [alpha]"f"(-1) - : "memory" - ); -} - -void ARGBSetRow_MMI(uint8_t* dst_argb, uint32_t v32, int width) { - __asm__ volatile ( - "punpcklwd %[v32], %[v32], %[v32] \n\t" - "1: \n\t" - "gssdlc1 %[v32], 0x07(%[dst_ptr]) \n\t" - "gssdrc1 %[v32], 0x00(%[dst_ptr]) \n\t" - "gssdlc1 %[v32], 0x0f(%[dst_ptr]) \n\t" - "gssdrc1 %[v32], 0x08(%[dst_ptr]) \n\t" - - "daddi %[width], %[width], -0x04 \n\t" - "daddiu %[dst_ptr], %[dst_ptr], 0x10 \n\t" - "bnez %[width], 1b \n\t" - : [v32]"+&f"(v32) - : [dst_ptr]"r"(dst_argb), [width]"r"(width) - : "memory" - ); -} -// clang-format on - -// 10 bit YUV to ARGB -#endif // !defined(LIBYUV_DISABLE_MMI) && defined(_MIPS_ARCH_LOONGSON3A) - -#ifdef __cplusplus -} // extern "C" -} // namespace libyuv -#endif diff --git a/thirdparty/libyuv/source/row_msa.cc b/thirdparty/libyuv/source/row_msa.cc deleted file mode 100644 index c0b13b0..0000000 --- a/thirdparty/libyuv/source/row_msa.cc +++ /dev/null @@ -1,3620 +0,0 @@ -/* - * Copyright 2016 The LibYuv Project Authors. All rights reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#include - -#include "libyuv/row.h" - -// This module is for GCC MSA -#if !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa) -#include "libyuv/macros_msa.h" - -#ifdef __cplusplus -namespace libyuv { -extern "C" { -#endif - -#define ALPHA_VAL (-1) - -// Fill YUV -> RGB conversion constants into vectors -#define YUVTORGB_SETUP(yuvconst, ub, vr, ug, vg, bb, bg, br, yg) \ - { \ - ub = __msa_fill_w(yuvconst->kUVToB[0]); \ - vr = __msa_fill_w(yuvconst->kUVToR[1]); \ - ug = __msa_fill_w(yuvconst->kUVToG[0]); \ - vg = __msa_fill_w(yuvconst->kUVToG[1]); \ - bb = __msa_fill_w(yuvconst->kUVBiasB[0]); \ - bg = __msa_fill_w(yuvconst->kUVBiasG[0]); \ - br = __msa_fill_w(yuvconst->kUVBiasR[0]); \ - yg = __msa_fill_w(yuvconst->kYToRgb[0]); \ - } - -// Load YUV 422 pixel data -#define READYUV422(psrc_y, psrc_u, psrc_v, out_y, out_u, out_v) \ - { \ - uint64_t y_m; \ - uint32_t u_m, v_m; \ - v4i32 zero_m = {0}; \ - y_m = LD(psrc_y); \ - u_m = LW(psrc_u); \ - v_m = LW(psrc_v); \ - out_y = (v16u8)__msa_insert_d((v2i64)zero_m, 0, (int64_t)y_m); \ - out_u = (v16u8)__msa_insert_w(zero_m, 0, (int32_t)u_m); \ - out_v = (v16u8)__msa_insert_w(zero_m, 0, (int32_t)v_m); \ - } - -// Clip input vector elements between 0 to 255 -#define CLIP_0TO255(in0, in1, in2, in3, in4, in5) \ - { \ - v4i32 max_m = __msa_ldi_w(0xFF); \ - \ - in0 = __msa_maxi_s_w(in0, 0); \ - in1 = __msa_maxi_s_w(in1, 0); \ - in2 = __msa_maxi_s_w(in2, 0); \ - in3 = __msa_maxi_s_w(in3, 0); \ - in4 = __msa_maxi_s_w(in4, 0); \ - in5 = __msa_maxi_s_w(in5, 0); \ - in0 = __msa_min_s_w(max_m, in0); \ - in1 = __msa_min_s_w(max_m, in1); \ - in2 = __msa_min_s_w(max_m, in2); \ - in3 = __msa_min_s_w(max_m, in3); \ - in4 = __msa_min_s_w(max_m, in4); \ - in5 = __msa_min_s_w(max_m, in5); \ - } - -// Convert 8 pixels of YUV 420 to RGB. 
-#define YUVTORGB(in_y, in_uv, ubvr, ugvg, bb, bg, br, yg, out_b, out_g, out_r) \ - { \ - v8i16 vec0_m, vec1_m; \ - v4i32 reg0_m, reg1_m, reg2_m, reg3_m, reg4_m; \ - v4i32 reg5_m, reg6_m, reg7_m; \ - v16i8 zero_m = {0}; \ - \ - vec0_m = (v8i16)__msa_ilvr_b((v16i8)in_y, (v16i8)in_y); \ - vec1_m = (v8i16)__msa_ilvr_b((v16i8)zero_m, (v16i8)in_uv); \ - reg0_m = (v4i32)__msa_ilvr_h((v8i16)zero_m, (v8i16)vec0_m); \ - reg1_m = (v4i32)__msa_ilvl_h((v8i16)zero_m, (v8i16)vec0_m); \ - reg2_m = (v4i32)__msa_ilvr_h((v8i16)zero_m, (v8i16)vec1_m); \ - reg3_m = (v4i32)__msa_ilvl_h((v8i16)zero_m, (v8i16)vec1_m); \ - reg0_m *= yg; \ - reg1_m *= yg; \ - reg2_m *= ubvr; \ - reg3_m *= ubvr; \ - reg0_m = __msa_srai_w(reg0_m, 16); \ - reg1_m = __msa_srai_w(reg1_m, 16); \ - reg4_m = __msa_dotp_s_w((v8i16)vec1_m, (v8i16)ugvg); \ - reg5_m = __msa_ilvev_w(reg2_m, reg2_m); \ - reg6_m = __msa_ilvev_w(reg3_m, reg3_m); \ - reg7_m = __msa_ilvr_w(reg4_m, reg4_m); \ - reg2_m = __msa_ilvod_w(reg2_m, reg2_m); \ - reg3_m = __msa_ilvod_w(reg3_m, reg3_m); \ - reg4_m = __msa_ilvl_w(reg4_m, reg4_m); \ - reg5_m = reg0_m - reg5_m; \ - reg6_m = reg1_m - reg6_m; \ - reg2_m = reg0_m - reg2_m; \ - reg3_m = reg1_m - reg3_m; \ - reg7_m = reg0_m - reg7_m; \ - reg4_m = reg1_m - reg4_m; \ - reg5_m += bb; \ - reg6_m += bb; \ - reg7_m += bg; \ - reg4_m += bg; \ - reg2_m += br; \ - reg3_m += br; \ - reg5_m = __msa_srai_w(reg5_m, 6); \ - reg6_m = __msa_srai_w(reg6_m, 6); \ - reg7_m = __msa_srai_w(reg7_m, 6); \ - reg4_m = __msa_srai_w(reg4_m, 6); \ - reg2_m = __msa_srai_w(reg2_m, 6); \ - reg3_m = __msa_srai_w(reg3_m, 6); \ - CLIP_0TO255(reg5_m, reg6_m, reg7_m, reg4_m, reg2_m, reg3_m); \ - out_b = __msa_pckev_h((v8i16)reg6_m, (v8i16)reg5_m); \ - out_g = __msa_pckev_h((v8i16)reg4_m, (v8i16)reg7_m); \ - out_r = __msa_pckev_h((v8i16)reg3_m, (v8i16)reg2_m); \ - } - -// Pack and Store 8 ARGB values. -#define STOREARGB(in0, in1, in2, in3, pdst_argb) \ - { \ - v8i16 vec0_m, vec1_m; \ - v16u8 dst0_m, dst1_m; \ - vec0_m = (v8i16)__msa_ilvev_b((v16i8)in1, (v16i8)in0); \ - vec1_m = (v8i16)__msa_ilvev_b((v16i8)in3, (v16i8)in2); \ - dst0_m = (v16u8)__msa_ilvr_h(vec1_m, vec0_m); \ - dst1_m = (v16u8)__msa_ilvl_h(vec1_m, vec0_m); \ - ST_UB2(dst0_m, dst1_m, pdst_argb, 16); \ - } - -// Takes ARGB input and calculates Y. 
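// [Editor's sketch, not part of the original file] Scalar view of what the
// ARGBTOY macro below computes for one pixel, using the coefficients that
// ARGBToYRow_MSA later passes in (25 for B, 129 for G, 66 for R, bias 0x1080,
// shift 8); other callers supply different constants. ScalarArgbToY is an
// illustrative name, not an upstream helper.
static inline uint8_t ScalarArgbToY(uint8_t b, uint8_t g, uint8_t r) {
  // 0x1080 folds in the +16 luma offset plus rounding before the >> 8.
  return (uint8_t)((25 * b + 129 * g + 66 * r + 0x1080) >> 8);
}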
-#define ARGBTOY(argb0, argb1, argb2, argb3, const0, const1, const2, shift, \ - y_out) \ - { \ - v16u8 vec0_m, vec1_m, vec2_m, vec3_m; \ - v8u16 reg0_m, reg1_m; \ - \ - vec0_m = (v16u8)__msa_pckev_h((v8i16)argb1, (v8i16)argb0); \ - vec1_m = (v16u8)__msa_pckev_h((v8i16)argb3, (v8i16)argb2); \ - vec2_m = (v16u8)__msa_pckod_h((v8i16)argb1, (v8i16)argb0); \ - vec3_m = (v16u8)__msa_pckod_h((v8i16)argb3, (v8i16)argb2); \ - reg0_m = __msa_dotp_u_h(vec0_m, const0); \ - reg1_m = __msa_dotp_u_h(vec1_m, const0); \ - reg0_m = __msa_dpadd_u_h(reg0_m, vec2_m, const1); \ - reg1_m = __msa_dpadd_u_h(reg1_m, vec3_m, const1); \ - reg0_m += const2; \ - reg1_m += const2; \ - reg0_m = (v8u16)__msa_srai_h((v8i16)reg0_m, shift); \ - reg1_m = (v8u16)__msa_srai_h((v8i16)reg1_m, shift); \ - y_out = (v16u8)__msa_pckev_b((v16i8)reg1_m, (v16i8)reg0_m); \ - } - -// Loads current and next row of ARGB input and averages it to calculate U and V -#define READ_ARGB(s_ptr, t_ptr, argb0, argb1, argb2, argb3, const_0x0101) \ - { \ - v16u8 src0_m, src1_m, src2_m, src3_m, src4_m, src5_m, src6_m, src7_m; \ - v16u8 vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m, vec6_m, vec7_m; \ - v8u16 reg0_m, reg1_m, reg2_m, reg3_m, reg4_m, reg5_m, reg6_m, reg7_m; \ - v8u16 reg8_m, reg9_m; \ - \ - src0_m = (v16u8)__msa_ld_b((void*)s, 0); \ - src1_m = (v16u8)__msa_ld_b((void*)s, 16); \ - src2_m = (v16u8)__msa_ld_b((void*)s, 32); \ - src3_m = (v16u8)__msa_ld_b((void*)s, 48); \ - src4_m = (v16u8)__msa_ld_b((void*)t, 0); \ - src5_m = (v16u8)__msa_ld_b((void*)t, 16); \ - src6_m = (v16u8)__msa_ld_b((void*)t, 32); \ - src7_m = (v16u8)__msa_ld_b((void*)t, 48); \ - vec0_m = (v16u8)__msa_ilvr_b((v16i8)src0_m, (v16i8)src4_m); \ - vec1_m = (v16u8)__msa_ilvr_b((v16i8)src1_m, (v16i8)src5_m); \ - vec2_m = (v16u8)__msa_ilvr_b((v16i8)src2_m, (v16i8)src6_m); \ - vec3_m = (v16u8)__msa_ilvr_b((v16i8)src3_m, (v16i8)src7_m); \ - vec4_m = (v16u8)__msa_ilvl_b((v16i8)src0_m, (v16i8)src4_m); \ - vec5_m = (v16u8)__msa_ilvl_b((v16i8)src1_m, (v16i8)src5_m); \ - vec6_m = (v16u8)__msa_ilvl_b((v16i8)src2_m, (v16i8)src6_m); \ - vec7_m = (v16u8)__msa_ilvl_b((v16i8)src3_m, (v16i8)src7_m); \ - reg0_m = __msa_hadd_u_h(vec0_m, vec0_m); \ - reg1_m = __msa_hadd_u_h(vec1_m, vec1_m); \ - reg2_m = __msa_hadd_u_h(vec2_m, vec2_m); \ - reg3_m = __msa_hadd_u_h(vec3_m, vec3_m); \ - reg4_m = __msa_hadd_u_h(vec4_m, vec4_m); \ - reg5_m = __msa_hadd_u_h(vec5_m, vec5_m); \ - reg6_m = __msa_hadd_u_h(vec6_m, vec6_m); \ - reg7_m = __msa_hadd_u_h(vec7_m, vec7_m); \ - reg8_m = (v8u16)__msa_pckev_d((v2i64)reg4_m, (v2i64)reg0_m); \ - reg9_m = (v8u16)__msa_pckev_d((v2i64)reg5_m, (v2i64)reg1_m); \ - reg8_m += (v8u16)__msa_pckod_d((v2i64)reg4_m, (v2i64)reg0_m); \ - reg9_m += (v8u16)__msa_pckod_d((v2i64)reg5_m, (v2i64)reg1_m); \ - reg0_m = (v8u16)__msa_pckev_d((v2i64)reg6_m, (v2i64)reg2_m); \ - reg1_m = (v8u16)__msa_pckev_d((v2i64)reg7_m, (v2i64)reg3_m); \ - reg0_m += (v8u16)__msa_pckod_d((v2i64)reg6_m, (v2i64)reg2_m); \ - reg1_m += (v8u16)__msa_pckod_d((v2i64)reg7_m, (v2i64)reg3_m); \ - reg8_m += const_0x0101; \ - reg9_m += const_0x0101; \ - reg0_m += const_0x0101; \ - reg1_m += const_0x0101; \ - argb0 = (v8u16)__msa_srai_h((v8i16)reg8_m, 1); \ - argb1 = (v8u16)__msa_srai_h((v8i16)reg9_m, 1); \ - argb2 = (v8u16)__msa_srai_h((v8i16)reg0_m, 1); \ - argb3 = (v8u16)__msa_srai_h((v8i16)reg1_m, 1); \ - } - -#define ARGBTOUV(argb0, argb1, argb2, argb3, const0, const1, const2, const3, \ - shf0, shf1, shf2, shf3, shift, u_out, v_out) \ - { \ - v8u16 vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m, vec6_m, vec7_m; \ - 
v4u32 reg0_m, reg1_m, reg2_m, reg3_m; \ - \ - vec0_m = (v8u16)__msa_vshf_h(shf0, (v16i8)argb1, (v16i8)argb0); \ - vec1_m = (v8u16)__msa_vshf_h(shf0, (v16i8)argb3, (v16i8)argb2); \ - vec2_m = (v8u16)__msa_vshf_h(shf1, (v16i8)argb1, (v16i8)argb0); \ - vec3_m = (v8u16)__msa_vshf_h(shf1, (v16i8)argb3, (v16i8)argb2); \ - vec4_m = (v8u16)__msa_vshf_h(shf2, (v16i8)argb1, (v16i8)argb0); \ - vec5_m = (v8u16)__msa_vshf_h(shf2, (v16i8)argb3, (v16i8)argb2); \ - vec6_m = (v8u16)__msa_vshf_h(shf3, (v16i8)argb1, (v16i8)argb0); \ - vec7_m = (v8u16)__msa_vshf_h(shf3, (v16i8)argb3, (v16i8)argb2); \ - reg0_m = __msa_dotp_u_w(vec0_m, const0); \ - reg1_m = __msa_dotp_u_w(vec1_m, const0); \ - reg2_m = __msa_dotp_u_w(vec4_m, const0); \ - reg3_m = __msa_dotp_u_w(vec5_m, const0); \ - reg0_m += const1; \ - reg1_m += const1; \ - reg2_m += const1; \ - reg3_m += const1; \ - reg0_m -= (v4u32)__msa_dotp_u_w(vec2_m, const2); \ - reg1_m -= (v4u32)__msa_dotp_u_w(vec3_m, const2); \ - reg2_m -= (v4u32)__msa_dotp_u_w(vec6_m, const3); \ - reg3_m -= (v4u32)__msa_dotp_u_w(vec7_m, const3); \ - reg0_m = __msa_srl_w(reg0_m, shift); \ - reg1_m = __msa_srl_w(reg1_m, shift); \ - reg2_m = __msa_srl_w(reg2_m, shift); \ - reg3_m = __msa_srl_w(reg3_m, shift); \ - u_out = (v8u16)__msa_pckev_h((v8i16)reg1_m, (v8i16)reg0_m); \ - v_out = (v8u16)__msa_pckev_h((v8i16)reg3_m, (v8i16)reg2_m); \ - } - -// Takes ARGB input and calculates U and V. -#define ARGBTOUV_H(argb0, argb1, argb2, argb3, const0, const1, const2, const3, \ - shf0, shf1, shf2, shf3, v_out, u_out) \ - { \ - v8u16 vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m, vec6_m, vec7_m; \ - v4u32 reg0_m, reg1_m, reg2_m, reg3_m; \ - \ - vec0_m = __msa_vshf_h(shf0, (v16i8)argb1, (v16i8)argb0); \ - vec1_m = __msa_vshf_h(shf0, (v16i8)argb3, (v16i8)argb2); \ - vec2_m = __msa_vshf_h(shf1, (v16i8)argb1, (v16i8)argb0); \ - vec3_m = __msa_vshf_h(shf1, (v16i8)argb3, (v16i8)argb2); \ - vec4_m = __msa_vshf_h(shf2, (v16i8)argb1, (v16i8)argb0); \ - vec5_m = __msa_vshf_h(shf2, (v16i8)argb3, (v16i8)argb2); \ - vec6_m = __msa_vshf_h(shf3, (v16i8)argb1, (v16i8)argb0); \ - vec7_m = __msa_vshf_h(shf3, (v16i8)argb3, (v16i8)argb2); \ - reg0_m = __msa_dotp_u_w(vec0_m, const1); \ - reg1_m = __msa_dotp_u_w(vec1_m, const1); \ - reg2_m = __msa_dotp_u_w(vec4_m, const1); \ - reg3_m = __msa_dotp_u_w(vec5_m, const1); \ - reg0_m += (v4u32)const3; \ - reg1_m += (v4u32)const3; \ - reg2_m += (v4u32)const3; \ - reg3_m += (v4u32)const3; \ - reg0_m -= __msa_dotp_u_w(vec2_m, const0); \ - reg1_m -= __msa_dotp_u_w(vec3_m, const0); \ - reg2_m -= __msa_dotp_u_w(vec6_m, const2); \ - reg3_m -= __msa_dotp_u_w(vec7_m, const2); \ - u_out = (v16u8)__msa_pckev_h((v8i16)reg3_m, (v8i16)reg2_m); \ - v_out = (v16u8)__msa_pckev_h((v8i16)reg1_m, (v8i16)reg0_m); \ - u_out = (v16u8)__msa_pckod_b((v16i8)u_out, (v16i8)u_out); \ - v_out = (v16u8)__msa_pckod_b((v16i8)v_out, (v16i8)v_out); \ - } - -// Load I444 pixel data -#define READI444(psrc_y, psrc_u, psrc_v, out_y, out_u, out_v) \ - { \ - uint64_t y_m, u_m, v_m; \ - v2i64 zero_m = {0}; \ - y_m = LD(psrc_y); \ - u_m = LD(psrc_u); \ - v_m = LD(psrc_v); \ - out_y = (v16u8)__msa_insert_d(zero_m, 0, (int64_t)y_m); \ - out_u = (v16u8)__msa_insert_d(zero_m, 0, (int64_t)u_m); \ - out_v = (v16u8)__msa_insert_d(zero_m, 0, (int64_t)v_m); \ - } - -void MirrorRow_MSA(const uint8_t* src, uint8_t* dst, int width) { - int x; - v16u8 src0, src1, src2, src3; - v16u8 dst0, dst1, dst2, dst3; - v16i8 shuffler = {15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0}; - src += width - 64; - - for (x = 0; x < width; x += 
64) { - LD_UB4(src, 16, src3, src2, src1, src0); - VSHF_B2_UB(src3, src3, src2, src2, shuffler, shuffler, dst3, dst2); - VSHF_B2_UB(src1, src1, src0, src0, shuffler, shuffler, dst1, dst0); - ST_UB4(dst0, dst1, dst2, dst3, dst, 16); - dst += 64; - src -= 64; - } -} - -void MirrorUVRow_MSA(const uint8_t* src_uv, uint8_t* dst_uv, int width) { - int x; - v8u16 src, dst; - v8u16 shuffler = {7, 6, 5, 4, 3, 2, 1, 0}; - src_uv += (width - 8) << 1; - for (x = 0; x < width; x += 8) { - src = LD_UH(src_uv); - dst = __msa_vshf_h(shuffler, src, src); - ST_UH(dst, dst_uv); - src_uv -= 16; - dst_uv += 16; - } -} - -void ARGBMirrorRow_MSA(const uint8_t* src, uint8_t* dst, int width) { - int x; - v16u8 src0, src1, src2, src3; - v16u8 dst0, dst1, dst2, dst3; - v16i8 shuffler = {12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3}; - src += width * 4 - 64; - - for (x = 0; x < width; x += 16) { - LD_UB4(src, 16, src3, src2, src1, src0); - VSHF_B2_UB(src3, src3, src2, src2, shuffler, shuffler, dst3, dst2); - VSHF_B2_UB(src1, src1, src0, src0, shuffler, shuffler, dst1, dst0); - ST_UB4(dst0, dst1, dst2, dst3, dst, 16); - dst += 64; - src -= 64; - } -} - -void I422ToYUY2Row_MSA(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* dst_yuy2, - int width) { - int x; - v16u8 src_u0, src_v0, src_y0, src_y1, vec_uv0, vec_uv1; - v16u8 dst_yuy2_0, dst_yuy2_1, dst_yuy2_2, dst_yuy2_3; - - for (x = 0; x < width; x += 32) { - src_u0 = LD_UB(src_u); - src_v0 = LD_UB(src_v); - LD_UB2(src_y, 16, src_y0, src_y1); - ILVRL_B2_UB(src_v0, src_u0, vec_uv0, vec_uv1); - ILVRL_B2_UB(vec_uv0, src_y0, dst_yuy2_0, dst_yuy2_1); - ILVRL_B2_UB(vec_uv1, src_y1, dst_yuy2_2, dst_yuy2_3); - ST_UB4(dst_yuy2_0, dst_yuy2_1, dst_yuy2_2, dst_yuy2_3, dst_yuy2, 16); - src_u += 16; - src_v += 16; - src_y += 32; - dst_yuy2 += 64; - } -} - -void I422ToUYVYRow_MSA(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* dst_uyvy, - int width) { - int x; - v16u8 src_u0, src_v0, src_y0, src_y1, vec_uv0, vec_uv1; - v16u8 dst_uyvy0, dst_uyvy1, dst_uyvy2, dst_uyvy3; - - for (x = 0; x < width; x += 32) { - src_u0 = LD_UB(src_u); - src_v0 = LD_UB(src_v); - LD_UB2(src_y, 16, src_y0, src_y1); - ILVRL_B2_UB(src_v0, src_u0, vec_uv0, vec_uv1); - ILVRL_B2_UB(src_y0, vec_uv0, dst_uyvy0, dst_uyvy1); - ILVRL_B2_UB(src_y1, vec_uv1, dst_uyvy2, dst_uyvy3); - ST_UB4(dst_uyvy0, dst_uyvy1, dst_uyvy2, dst_uyvy3, dst_uyvy, 16); - src_u += 16; - src_v += 16; - src_y += 32; - dst_uyvy += 64; - } -} - -void I422ToARGBRow_MSA(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { - int x; - v16u8 src0, src1, src2; - v8i16 vec0, vec1, vec2; - v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg; - v4i32 vec_ubvr, vec_ugvg; - v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL); - - YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, - vec_br, vec_yg); - vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub); - vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug); - - for (x = 0; x < width; x += 8) { - READYUV422(src_y, src_u, src_v, src0, src1, src2); - src1 = (v16u8)__msa_ilvr_b((v16i8)src2, (v16i8)src1); - YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg, - vec0, vec1, vec2); - STOREARGB(vec0, vec1, vec2, alpha, dst_argb); - src_y += 8; - src_u += 4; - src_v += 4; - dst_argb += 32; - } -} - -void I422ToRGBARow_MSA(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - 
uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { - int x; - v16u8 src0, src1, src2; - v8i16 vec0, vec1, vec2; - v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg; - v4i32 vec_ubvr, vec_ugvg; - v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL); - - YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, - vec_br, vec_yg); - vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub); - vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug); - - for (x = 0; x < width; x += 8) { - READYUV422(src_y, src_u, src_v, src0, src1, src2); - src1 = (v16u8)__msa_ilvr_b((v16i8)src2, (v16i8)src1); - YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg, - vec0, vec1, vec2); - STOREARGB(alpha, vec0, vec1, vec2, dst_argb); - src_y += 8; - src_u += 4; - src_v += 4; - dst_argb += 32; - } -} - -void I422AlphaToARGBRow_MSA(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - const uint8_t* src_a, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { - int x; - int64_t data_a; - v16u8 src0, src1, src2, src3; - v8i16 vec0, vec1, vec2; - v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg; - v4i32 vec_ubvr, vec_ugvg; - v4i32 zero = {0}; - - YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, - vec_br, vec_yg); - vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub); - vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug); - - for (x = 0; x < width; x += 8) { - data_a = LD(src_a); - READYUV422(src_y, src_u, src_v, src0, src1, src2); - src1 = (v16u8)__msa_ilvr_b((v16i8)src2, (v16i8)src1); - src3 = (v16u8)__msa_insert_d((v2i64)zero, 0, data_a); - YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg, - vec0, vec1, vec2); - src3 = (v16u8)__msa_ilvr_b((v16i8)src3, (v16i8)src3); - STOREARGB(vec0, vec1, vec2, src3, dst_argb); - src_y += 8; - src_u += 4; - src_v += 4; - src_a += 8; - dst_argb += 32; - } -} - -void I422ToRGB24Row_MSA(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int32_t width) { - int x; - int64_t data_u, data_v; - v16u8 src0, src1, src2, src3, src4, dst0, dst1, dst2; - v8i16 vec0, vec1, vec2, vec3, vec4, vec5; - v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg; - v4i32 vec_ubvr, vec_ugvg; - v16u8 reg0, reg1, reg2, reg3; - v2i64 zero = {0}; - v16i8 shuffler0 = {0, 1, 16, 2, 3, 17, 4, 5, 18, 6, 7, 19, 8, 9, 20, 10}; - v16i8 shuffler1 = {0, 21, 1, 2, 22, 3, 4, 23, 5, 6, 24, 7, 8, 25, 9, 10}; - v16i8 shuffler2 = {26, 6, 7, 27, 8, 9, 28, 10, - 11, 29, 12, 13, 30, 14, 15, 31}; - - YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, - vec_br, vec_yg); - vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub); - vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug); - - for (x = 0; x < width; x += 16) { - src0 = (v16u8)__msa_ld_b((v16u8*)src_y, 0); - data_u = LD(src_u); - data_v = LD(src_v); - src1 = (v16u8)__msa_insert_d(zero, 0, data_u); - src2 = (v16u8)__msa_insert_d(zero, 0, data_v); - src1 = (v16u8)__msa_ilvr_b((v16i8)src2, (v16i8)src1); - src3 = (v16u8)__msa_sldi_b((v16i8)src0, (v16i8)src0, 8); - src4 = (v16u8)__msa_sldi_b((v16i8)src1, (v16i8)src1, 8); - YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg, - vec0, vec1, vec2); - YUVTORGB(src3, src4, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg, - vec3, vec4, vec5); - reg0 = (v16u8)__msa_ilvev_b((v16i8)vec1, (v16i8)vec0); - reg2 = 
(v16u8)__msa_ilvev_b((v16i8)vec4, (v16i8)vec3); - reg3 = (v16u8)__msa_pckev_b((v16i8)vec5, (v16i8)vec2); - reg1 = (v16u8)__msa_sldi_b((v16i8)reg2, (v16i8)reg0, 11); - dst0 = (v16u8)__msa_vshf_b(shuffler0, (v16i8)reg3, (v16i8)reg0); - dst1 = (v16u8)__msa_vshf_b(shuffler1, (v16i8)reg3, (v16i8)reg1); - dst2 = (v16u8)__msa_vshf_b(shuffler2, (v16i8)reg3, (v16i8)reg2); - ST_UB2(dst0, dst1, dst_argb, 16); - ST_UB(dst2, (dst_argb + 32)); - src_y += 16; - src_u += 8; - src_v += 8; - dst_argb += 48; - } -} - -// TODO(fbarchard): Consider AND instead of shift to isolate 5 upper bits of R. -void I422ToRGB565Row_MSA(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* dst_rgb565, - const struct YuvConstants* yuvconstants, - int width) { - int x; - v16u8 src0, src1, src2, dst0; - v8i16 vec0, vec1, vec2; - v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg; - v4i32 vec_ubvr, vec_ugvg; - - YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, - vec_br, vec_yg); - vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub); - vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug); - - for (x = 0; x < width; x += 8) { - READYUV422(src_y, src_u, src_v, src0, src1, src2); - src1 = (v16u8)__msa_ilvr_b((v16i8)src2, (v16i8)src1); - YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg, - vec0, vec2, vec1); - vec0 = __msa_srai_h(vec0, 3); - vec1 = __msa_srai_h(vec1, 3); - vec2 = __msa_srai_h(vec2, 2); - vec1 = __msa_slli_h(vec1, 11); - vec2 = __msa_slli_h(vec2, 5); - vec0 |= vec1; - dst0 = (v16u8)(vec2 | vec0); - ST_UB(dst0, dst_rgb565); - src_y += 8; - src_u += 4; - src_v += 4; - dst_rgb565 += 16; - } -} - -// TODO(fbarchard): Consider AND instead of shift to isolate 4 upper bits of G. -void I422ToARGB4444Row_MSA(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* dst_argb4444, - const struct YuvConstants* yuvconstants, - int width) { - int x; - v16u8 src0, src1, src2, dst0; - v8i16 vec0, vec1, vec2; - v8u16 reg0, reg1, reg2; - v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg; - v4i32 vec_ubvr, vec_ugvg; - v8u16 const_0xF000 = (v8u16)__msa_fill_h(0xF000); - - YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, - vec_br, vec_yg); - vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub); - vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug); - - for (x = 0; x < width; x += 8) { - READYUV422(src_y, src_u, src_v, src0, src1, src2); - src1 = (v16u8)__msa_ilvr_b((v16i8)src2, (v16i8)src1); - YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg, - vec0, vec1, vec2); - reg0 = (v8u16)__msa_srai_h(vec0, 4); - reg1 = (v8u16)__msa_srai_h(vec1, 4); - reg2 = (v8u16)__msa_srai_h(vec2, 4); - reg1 = (v8u16)__msa_slli_h((v8i16)reg1, 4); - reg2 = (v8u16)__msa_slli_h((v8i16)reg2, 8); - reg1 |= const_0xF000; - reg0 |= reg2; - dst0 = (v16u8)(reg1 | reg0); - ST_UB(dst0, dst_argb4444); - src_y += 8; - src_u += 4; - src_v += 4; - dst_argb4444 += 16; - } -} - -void I422ToARGB1555Row_MSA(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* dst_argb1555, - const struct YuvConstants* yuvconstants, - int width) { - int x; - v16u8 src0, src1, src2, dst0; - v8i16 vec0, vec1, vec2; - v8u16 reg0, reg1, reg2; - v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg; - v4i32 vec_ubvr, vec_ugvg; - v8u16 const_0x8000 = (v8u16)__msa_fill_h(0x8000); - - YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, - vec_br, vec_yg); - 
vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub); - vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug); - - for (x = 0; x < width; x += 8) { - READYUV422(src_y, src_u, src_v, src0, src1, src2); - src1 = (v16u8)__msa_ilvr_b((v16i8)src2, (v16i8)src1); - YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg, - vec0, vec1, vec2); - reg0 = (v8u16)__msa_srai_h(vec0, 3); - reg1 = (v8u16)__msa_srai_h(vec1, 3); - reg2 = (v8u16)__msa_srai_h(vec2, 3); - reg1 = (v8u16)__msa_slli_h((v8i16)reg1, 5); - reg2 = (v8u16)__msa_slli_h((v8i16)reg2, 10); - reg1 |= const_0x8000; - reg0 |= reg2; - dst0 = (v16u8)(reg1 | reg0); - ST_UB(dst0, dst_argb1555); - src_y += 8; - src_u += 4; - src_v += 4; - dst_argb1555 += 16; - } -} - -void YUY2ToYRow_MSA(const uint8_t* src_yuy2, uint8_t* dst_y, int width) { - int x; - v16u8 src0, src1, src2, src3, dst0, dst1; - - for (x = 0; x < width; x += 32) { - LD_UB4(src_yuy2, 16, src0, src1, src2, src3); - dst0 = (v16u8)__msa_pckev_b((v16i8)src1, (v16i8)src0); - dst1 = (v16u8)__msa_pckev_b((v16i8)src3, (v16i8)src2); - ST_UB2(dst0, dst1, dst_y, 16); - src_yuy2 += 64; - dst_y += 32; - } -} - -void YUY2ToUVRow_MSA(const uint8_t* src_yuy2, - int src_stride_yuy2, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - const uint8_t* src_yuy2_next = src_yuy2 + src_stride_yuy2; - int x; - v16u8 src0, src1, src2, src3, src4, src5, src6, src7; - v16u8 vec0, vec1, dst0, dst1; - - for (x = 0; x < width; x += 32) { - LD_UB4(src_yuy2, 16, src0, src1, src2, src3); - LD_UB4(src_yuy2_next, 16, src4, src5, src6, src7); - src0 = (v16u8)__msa_pckod_b((v16i8)src1, (v16i8)src0); - src1 = (v16u8)__msa_pckod_b((v16i8)src3, (v16i8)src2); - src2 = (v16u8)__msa_pckod_b((v16i8)src5, (v16i8)src4); - src3 = (v16u8)__msa_pckod_b((v16i8)src7, (v16i8)src6); - vec0 = __msa_aver_u_b(src0, src2); - vec1 = __msa_aver_u_b(src1, src3); - dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0); - dst1 = (v16u8)__msa_pckod_b((v16i8)vec1, (v16i8)vec0); - ST_UB(dst0, dst_u); - ST_UB(dst1, dst_v); - src_yuy2 += 64; - src_yuy2_next += 64; - dst_u += 16; - dst_v += 16; - } -} - -void YUY2ToUV422Row_MSA(const uint8_t* src_yuy2, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - int x; - v16u8 src0, src1, src2, src3, dst0, dst1; - - for (x = 0; x < width; x += 32) { - LD_UB4(src_yuy2, 16, src0, src1, src2, src3); - src0 = (v16u8)__msa_pckod_b((v16i8)src1, (v16i8)src0); - src1 = (v16u8)__msa_pckod_b((v16i8)src3, (v16i8)src2); - dst0 = (v16u8)__msa_pckev_b((v16i8)src1, (v16i8)src0); - dst1 = (v16u8)__msa_pckod_b((v16i8)src1, (v16i8)src0); - ST_UB(dst0, dst_u); - ST_UB(dst1, dst_v); - src_yuy2 += 64; - dst_u += 16; - dst_v += 16; - } -} - -void UYVYToYRow_MSA(const uint8_t* src_uyvy, uint8_t* dst_y, int width) { - int x; - v16u8 src0, src1, src2, src3, dst0, dst1; - - for (x = 0; x < width; x += 32) { - LD_UB4(src_uyvy, 16, src0, src1, src2, src3); - dst0 = (v16u8)__msa_pckod_b((v16i8)src1, (v16i8)src0); - dst1 = (v16u8)__msa_pckod_b((v16i8)src3, (v16i8)src2); - ST_UB2(dst0, dst1, dst_y, 16); - src_uyvy += 64; - dst_y += 32; - } -} - -void UYVYToUVRow_MSA(const uint8_t* src_uyvy, - int src_stride_uyvy, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - const uint8_t* src_uyvy_next = src_uyvy + src_stride_uyvy; - int x; - v16u8 src0, src1, src2, src3, src4, src5, src6, src7; - v16u8 vec0, vec1, dst0, dst1; - - for (x = 0; x < width; x += 32) { - LD_UB4(src_uyvy, 16, src0, src1, src2, src3); - LD_UB4(src_uyvy_next, 16, src4, src5, src6, src7); - src0 = (v16u8)__msa_pckev_b((v16i8)src1, (v16i8)src0); - src1 = 
(v16u8)__msa_pckev_b((v16i8)src3, (v16i8)src2); - src2 = (v16u8)__msa_pckev_b((v16i8)src5, (v16i8)src4); - src3 = (v16u8)__msa_pckev_b((v16i8)src7, (v16i8)src6); - vec0 = __msa_aver_u_b(src0, src2); - vec1 = __msa_aver_u_b(src1, src3); - dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0); - dst1 = (v16u8)__msa_pckod_b((v16i8)vec1, (v16i8)vec0); - ST_UB(dst0, dst_u); - ST_UB(dst1, dst_v); - src_uyvy += 64; - src_uyvy_next += 64; - dst_u += 16; - dst_v += 16; - } -} - -void UYVYToUV422Row_MSA(const uint8_t* src_uyvy, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - int x; - v16u8 src0, src1, src2, src3, dst0, dst1; - - for (x = 0; x < width; x += 32) { - LD_UB4(src_uyvy, 16, src0, src1, src2, src3); - src0 = (v16u8)__msa_pckev_b((v16i8)src1, (v16i8)src0); - src1 = (v16u8)__msa_pckev_b((v16i8)src3, (v16i8)src2); - dst0 = (v16u8)__msa_pckev_b((v16i8)src1, (v16i8)src0); - dst1 = (v16u8)__msa_pckod_b((v16i8)src1, (v16i8)src0); - ST_UB(dst0, dst_u); - ST_UB(dst1, dst_v); - src_uyvy += 64; - dst_u += 16; - dst_v += 16; - } -} - -void ARGBToYRow_MSA(const uint8_t* src_argb, uint8_t* dst_y, int width) { - int x; - v16u8 src0, src1, src2, src3, vec0, vec1, vec2, vec3, dst0; - v8u16 reg0, reg1, reg2, reg3, reg4, reg5; - v16i8 zero = {0}; - v8u16 const_0x19 = (v8u16)__msa_ldi_h(0x19); - v8u16 const_0x81 = (v8u16)__msa_ldi_h(0x81); - v8u16 const_0x42 = (v8u16)__msa_ldi_h(0x42); - v8u16 const_0x1080 = (v8u16)__msa_fill_h(0x1080); - - for (x = 0; x < width; x += 16) { - src0 = (v16u8)__msa_ld_b((v16u8*)src_argb, 0); - src1 = (v16u8)__msa_ld_b((v16u8*)src_argb, 16); - src2 = (v16u8)__msa_ld_b((v16u8*)src_argb, 32); - src3 = (v16u8)__msa_ld_b((v16u8*)src_argb, 48); - vec0 = (v16u8)__msa_pckev_b((v16i8)src1, (v16i8)src0); - vec1 = (v16u8)__msa_pckev_b((v16i8)src3, (v16i8)src2); - vec2 = (v16u8)__msa_pckod_b((v16i8)src1, (v16i8)src0); - vec3 = (v16u8)__msa_pckod_b((v16i8)src3, (v16i8)src2); - reg0 = (v8u16)__msa_ilvev_b(zero, (v16i8)vec0); - reg1 = (v8u16)__msa_ilvev_b(zero, (v16i8)vec1); - reg2 = (v8u16)__msa_ilvev_b(zero, (v16i8)vec2); - reg3 = (v8u16)__msa_ilvev_b(zero, (v16i8)vec3); - reg4 = (v8u16)__msa_ilvod_b(zero, (v16i8)vec0); - reg5 = (v8u16)__msa_ilvod_b(zero, (v16i8)vec1); - reg0 *= const_0x19; - reg1 *= const_0x19; - reg2 *= const_0x81; - reg3 *= const_0x81; - reg4 *= const_0x42; - reg5 *= const_0x42; - reg0 += reg2; - reg1 += reg3; - reg0 += reg4; - reg1 += reg5; - reg0 += const_0x1080; - reg1 += const_0x1080; - reg0 = (v8u16)__msa_srai_h((v8i16)reg0, 8); - reg1 = (v8u16)__msa_srai_h((v8i16)reg1, 8); - dst0 = (v16u8)__msa_pckev_b((v16i8)reg1, (v16i8)reg0); - ST_UB(dst0, dst_y); - src_argb += 64; - dst_y += 16; - } -} - -void ARGBToUVRow_MSA(const uint8_t* src_argb, - int src_stride_argb, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - int x; - const uint8_t* src_argb_next = src_argb + src_stride_argb; - v16u8 src0, src1, src2, src3, src4, src5, src6, src7; - v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9; - v8u16 reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7, reg8, reg9; - v16u8 dst0, dst1; - v8u16 const_0x70 = (v8u16)__msa_ldi_h(0x38); - v8u16 const_0x4A = (v8u16)__msa_ldi_h(0x25); - v8u16 const_0x26 = (v8u16)__msa_ldi_h(0x13); - v8u16 const_0x5E = (v8u16)__msa_ldi_h(0x2f); - v8u16 const_0x12 = (v8u16)__msa_ldi_h(0x09); - v8u16 const_0x8080 = (v8u16)__msa_fill_h(0x8080); - v8u16 const_0x0001 = (v8u16)__msa_fill_h(0x0001); - - for (x = 0; x < width; x += 32) { - src0 = (v16u8)__msa_ld_b((v16u8*)src_argb, 0); - src1 = (v16u8)__msa_ld_b((v16u8*)src_argb, 16); - src2 
= (v16u8)__msa_ld_b((v16u8*)src_argb, 32); - src3 = (v16u8)__msa_ld_b((v16u8*)src_argb, 48); - src4 = (v16u8)__msa_ld_b((v16u8*)src_argb, 64); - src5 = (v16u8)__msa_ld_b((v16u8*)src_argb, 80); - src6 = (v16u8)__msa_ld_b((v16u8*)src_argb, 96); - src7 = (v16u8)__msa_ld_b((v16u8*)src_argb, 112); - vec0 = (v16u8)__msa_pckev_b((v16i8)src1, (v16i8)src0); - vec1 = (v16u8)__msa_pckev_b((v16i8)src3, (v16i8)src2); - vec2 = (v16u8)__msa_pckev_b((v16i8)src5, (v16i8)src4); - vec3 = (v16u8)__msa_pckev_b((v16i8)src7, (v16i8)src6); - vec4 = (v16u8)__msa_pckod_b((v16i8)src1, (v16i8)src0); - vec5 = (v16u8)__msa_pckod_b((v16i8)src3, (v16i8)src2); - vec6 = (v16u8)__msa_pckod_b((v16i8)src5, (v16i8)src4); - vec7 = (v16u8)__msa_pckod_b((v16i8)src7, (v16i8)src6); - vec8 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0); - vec9 = (v16u8)__msa_pckev_b((v16i8)vec3, (v16i8)vec2); - vec4 = (v16u8)__msa_pckev_b((v16i8)vec5, (v16i8)vec4); - vec5 = (v16u8)__msa_pckev_b((v16i8)vec7, (v16i8)vec6); - vec0 = (v16u8)__msa_pckod_b((v16i8)vec1, (v16i8)vec0); - vec1 = (v16u8)__msa_pckod_b((v16i8)vec3, (v16i8)vec2); - reg0 = __msa_hadd_u_h(vec8, vec8); - reg1 = __msa_hadd_u_h(vec9, vec9); - reg2 = __msa_hadd_u_h(vec4, vec4); - reg3 = __msa_hadd_u_h(vec5, vec5); - reg4 = __msa_hadd_u_h(vec0, vec0); - reg5 = __msa_hadd_u_h(vec1, vec1); - src0 = (v16u8)__msa_ld_b((v16u8*)src_argb_next, 0); - src1 = (v16u8)__msa_ld_b((v16u8*)src_argb_next, 16); - src2 = (v16u8)__msa_ld_b((v16u8*)src_argb_next, 32); - src3 = (v16u8)__msa_ld_b((v16u8*)src_argb_next, 48); - src4 = (v16u8)__msa_ld_b((v16u8*)src_argb_next, 64); - src5 = (v16u8)__msa_ld_b((v16u8*)src_argb_next, 80); - src6 = (v16u8)__msa_ld_b((v16u8*)src_argb_next, 96); - src7 = (v16u8)__msa_ld_b((v16u8*)src_argb_next, 112); - vec0 = (v16u8)__msa_pckev_b((v16i8)src1, (v16i8)src0); - vec1 = (v16u8)__msa_pckev_b((v16i8)src3, (v16i8)src2); - vec2 = (v16u8)__msa_pckev_b((v16i8)src5, (v16i8)src4); - vec3 = (v16u8)__msa_pckev_b((v16i8)src7, (v16i8)src6); - vec4 = (v16u8)__msa_pckod_b((v16i8)src1, (v16i8)src0); - vec5 = (v16u8)__msa_pckod_b((v16i8)src3, (v16i8)src2); - vec6 = (v16u8)__msa_pckod_b((v16i8)src5, (v16i8)src4); - vec7 = (v16u8)__msa_pckod_b((v16i8)src7, (v16i8)src6); - vec8 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0); - vec9 = (v16u8)__msa_pckev_b((v16i8)vec3, (v16i8)vec2); - vec4 = (v16u8)__msa_pckev_b((v16i8)vec5, (v16i8)vec4); - vec5 = (v16u8)__msa_pckev_b((v16i8)vec7, (v16i8)vec6); - vec0 = (v16u8)__msa_pckod_b((v16i8)vec1, (v16i8)vec0); - vec1 = (v16u8)__msa_pckod_b((v16i8)vec3, (v16i8)vec2); - reg0 += __msa_hadd_u_h(vec8, vec8); - reg1 += __msa_hadd_u_h(vec9, vec9); - reg2 += __msa_hadd_u_h(vec4, vec4); - reg3 += __msa_hadd_u_h(vec5, vec5); - reg4 += __msa_hadd_u_h(vec0, vec0); - reg5 += __msa_hadd_u_h(vec1, vec1); - reg0 += const_0x0001; - reg1 += const_0x0001; - reg2 += const_0x0001; - reg3 += const_0x0001; - reg4 += const_0x0001; - reg5 += const_0x0001; - reg0 = (v8u16)__msa_srai_h((v8i16)reg0, 1); - reg1 = (v8u16)__msa_srai_h((v8i16)reg1, 1); - reg2 = (v8u16)__msa_srai_h((v8i16)reg2, 1); - reg3 = (v8u16)__msa_srai_h((v8i16)reg3, 1); - reg4 = (v8u16)__msa_srai_h((v8i16)reg4, 1); - reg5 = (v8u16)__msa_srai_h((v8i16)reg5, 1); - reg6 = reg0 * const_0x70; - reg7 = reg1 * const_0x70; - reg8 = reg2 * const_0x4A; - reg9 = reg3 * const_0x4A; - reg6 += const_0x8080; - reg7 += const_0x8080; - reg8 += reg4 * const_0x26; - reg9 += reg5 * const_0x26; - reg0 *= const_0x12; - reg1 *= const_0x12; - reg2 *= const_0x5E; - reg3 *= const_0x5E; - reg4 *= const_0x70; - reg5 *= const_0x70; - 
reg2 += reg0; - reg3 += reg1; - reg4 += const_0x8080; - reg5 += const_0x8080; - reg6 -= reg8; - reg7 -= reg9; - reg4 -= reg2; - reg5 -= reg3; - reg6 = (v8u16)__msa_srai_h((v8i16)reg6, 8); - reg7 = (v8u16)__msa_srai_h((v8i16)reg7, 8); - reg4 = (v8u16)__msa_srai_h((v8i16)reg4, 8); - reg5 = (v8u16)__msa_srai_h((v8i16)reg5, 8); - dst0 = (v16u8)__msa_pckev_b((v16i8)reg7, (v16i8)reg6); - dst1 = (v16u8)__msa_pckev_b((v16i8)reg5, (v16i8)reg4); - ST_UB(dst0, dst_u); - ST_UB(dst1, dst_v); - src_argb += 128; - src_argb_next += 128; - dst_u += 16; - dst_v += 16; - } -} - -void ARGBToRGB24Row_MSA(const uint8_t* src_argb, uint8_t* dst_rgb, int width) { - int x; - v16u8 src0, src1, src2, src3, dst0, dst1, dst2; - v16i8 shuffler0 = {0, 1, 2, 4, 5, 6, 8, 9, 10, 12, 13, 14, 16, 17, 18, 20}; - v16i8 shuffler1 = {5, 6, 8, 9, 10, 12, 13, 14, - 16, 17, 18, 20, 21, 22, 24, 25}; - v16i8 shuffler2 = {10, 12, 13, 14, 16, 17, 18, 20, - 21, 22, 24, 25, 26, 28, 29, 30}; - - for (x = 0; x < width; x += 16) { - src0 = (v16u8)__msa_ld_b((void*)src_argb, 0); - src1 = (v16u8)__msa_ld_b((void*)src_argb, 16); - src2 = (v16u8)__msa_ld_b((void*)src_argb, 32); - src3 = (v16u8)__msa_ld_b((void*)src_argb, 48); - dst0 = (v16u8)__msa_vshf_b(shuffler0, (v16i8)src1, (v16i8)src0); - dst1 = (v16u8)__msa_vshf_b(shuffler1, (v16i8)src2, (v16i8)src1); - dst2 = (v16u8)__msa_vshf_b(shuffler2, (v16i8)src3, (v16i8)src2); - ST_UB2(dst0, dst1, dst_rgb, 16); - ST_UB(dst2, (dst_rgb + 32)); - src_argb += 64; - dst_rgb += 48; - } -} - -void ARGBToRAWRow_MSA(const uint8_t* src_argb, uint8_t* dst_rgb, int width) { - int x; - v16u8 src0, src1, src2, src3, dst0, dst1, dst2; - v16i8 shuffler0 = {2, 1, 0, 6, 5, 4, 10, 9, 8, 14, 13, 12, 18, 17, 16, 22}; - v16i8 shuffler1 = {5, 4, 10, 9, 8, 14, 13, 12, - 18, 17, 16, 22, 21, 20, 26, 25}; - v16i8 shuffler2 = {8, 14, 13, 12, 18, 17, 16, 22, - 21, 20, 26, 25, 24, 30, 29, 28}; - - for (x = 0; x < width; x += 16) { - src0 = (v16u8)__msa_ld_b((void*)src_argb, 0); - src1 = (v16u8)__msa_ld_b((void*)src_argb, 16); - src2 = (v16u8)__msa_ld_b((void*)src_argb, 32); - src3 = (v16u8)__msa_ld_b((void*)src_argb, 48); - dst0 = (v16u8)__msa_vshf_b(shuffler0, (v16i8)src1, (v16i8)src0); - dst1 = (v16u8)__msa_vshf_b(shuffler1, (v16i8)src2, (v16i8)src1); - dst2 = (v16u8)__msa_vshf_b(shuffler2, (v16i8)src3, (v16i8)src2); - ST_UB2(dst0, dst1, dst_rgb, 16); - ST_UB(dst2, (dst_rgb + 32)); - src_argb += 64; - dst_rgb += 48; - } -} - -void ARGBToRGB565Row_MSA(const uint8_t* src_argb, uint8_t* dst_rgb, int width) { - int x; - v16u8 src0, src1, dst0; - v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; - v16i8 zero = {0}; - - for (x = 0; x < width; x += 8) { - src0 = (v16u8)__msa_ld_b((void*)src_argb, 0); - src1 = (v16u8)__msa_ld_b((void*)src_argb, 16); - vec0 = (v16u8)__msa_srai_b((v16i8)src0, 3); - vec1 = (v16u8)__msa_slli_b((v16i8)src0, 3); - vec2 = (v16u8)__msa_srai_b((v16i8)src0, 5); - vec4 = (v16u8)__msa_srai_b((v16i8)src1, 3); - vec5 = (v16u8)__msa_slli_b((v16i8)src1, 3); - vec6 = (v16u8)__msa_srai_b((v16i8)src1, 5); - vec1 = (v16u8)__msa_sldi_b(zero, (v16i8)vec1, 1); - vec2 = (v16u8)__msa_sldi_b(zero, (v16i8)vec2, 1); - vec5 = (v16u8)__msa_sldi_b(zero, (v16i8)vec5, 1); - vec6 = (v16u8)__msa_sldi_b(zero, (v16i8)vec6, 1); - vec3 = (v16u8)__msa_sldi_b(zero, (v16i8)src0, 2); - vec7 = (v16u8)__msa_sldi_b(zero, (v16i8)src1, 2); - vec0 = __msa_binsli_b(vec0, vec1, 2); - vec1 = __msa_binsli_b(vec2, vec3, 4); - vec4 = __msa_binsli_b(vec4, vec5, 2); - vec5 = __msa_binsli_b(vec6, vec7, 4); - vec0 = (v16u8)__msa_ilvev_b((v16i8)vec1, 
(v16i8)vec0); - vec4 = (v16u8)__msa_ilvev_b((v16i8)vec5, (v16i8)vec4); - dst0 = (v16u8)__msa_pckev_h((v8i16)vec4, (v8i16)vec0); - ST_UB(dst0, dst_rgb); - src_argb += 32; - dst_rgb += 16; - } -} - -void ARGBToARGB1555Row_MSA(const uint8_t* src_argb, - uint8_t* dst_rgb, - int width) { - int x; - v16u8 src0, src1, dst0; - v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9; - v16i8 zero = {0}; - - for (x = 0; x < width; x += 8) { - src0 = (v16u8)__msa_ld_b((void*)src_argb, 0); - src1 = (v16u8)__msa_ld_b((void*)src_argb, 16); - vec0 = (v16u8)__msa_srai_b((v16i8)src0, 3); - vec1 = (v16u8)__msa_slli_b((v16i8)src0, 2); - vec2 = (v16u8)__msa_srai_b((v16i8)vec0, 3); - vec1 = (v16u8)__msa_sldi_b(zero, (v16i8)vec1, 1); - vec2 = (v16u8)__msa_sldi_b(zero, (v16i8)vec2, 1); - vec3 = (v16u8)__msa_srai_b((v16i8)src0, 1); - vec5 = (v16u8)__msa_srai_b((v16i8)src1, 3); - vec6 = (v16u8)__msa_slli_b((v16i8)src1, 2); - vec7 = (v16u8)__msa_srai_b((v16i8)vec5, 3); - vec6 = (v16u8)__msa_sldi_b(zero, (v16i8)vec6, 1); - vec7 = (v16u8)__msa_sldi_b(zero, (v16i8)vec7, 1); - vec8 = (v16u8)__msa_srai_b((v16i8)src1, 1); - vec3 = (v16u8)__msa_sldi_b(zero, (v16i8)vec3, 2); - vec8 = (v16u8)__msa_sldi_b(zero, (v16i8)vec8, 2); - vec4 = (v16u8)__msa_sldi_b(zero, (v16i8)src0, 3); - vec9 = (v16u8)__msa_sldi_b(zero, (v16i8)src1, 3); - vec0 = __msa_binsli_b(vec0, vec1, 2); - vec5 = __msa_binsli_b(vec5, vec6, 2); - vec1 = __msa_binsli_b(vec2, vec3, 5); - vec6 = __msa_binsli_b(vec7, vec8, 5); - vec1 = __msa_binsli_b(vec1, vec4, 0); - vec6 = __msa_binsli_b(vec6, vec9, 0); - vec0 = (v16u8)__msa_ilvev_b((v16i8)vec1, (v16i8)vec0); - vec1 = (v16u8)__msa_ilvev_b((v16i8)vec6, (v16i8)vec5); - dst0 = (v16u8)__msa_pckev_h((v8i16)vec1, (v8i16)vec0); - ST_UB(dst0, dst_rgb); - src_argb += 32; - dst_rgb += 16; - } -} - -void ARGBToARGB4444Row_MSA(const uint8_t* src_argb, - uint8_t* dst_rgb, - int width) { - int x; - v16u8 src0, src1; - v16u8 vec0, vec1; - v16u8 dst0; - v16i8 zero = {0}; - - for (x = 0; x < width; x += 8) { - src0 = (v16u8)__msa_ld_b((void*)src_argb, 0); - src1 = (v16u8)__msa_ld_b((void*)src_argb, 16); - vec0 = (v16u8)__msa_srai_b((v16i8)src0, 4); - vec1 = (v16u8)__msa_srai_b((v16i8)src1, 4); - src0 = (v16u8)__msa_sldi_b(zero, (v16i8)src0, 1); - src1 = (v16u8)__msa_sldi_b(zero, (v16i8)src1, 1); - vec0 = __msa_binsli_b(vec0, src0, 3); - vec1 = __msa_binsli_b(vec1, src1, 3); - dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0); - ST_UB(dst0, dst_rgb); - src_argb += 32; - dst_rgb += 16; - } -} - -void ARGBToUV444Row_MSA(const uint8_t* src_argb, - uint8_t* dst_u, - uint8_t* dst_v, - int32_t width) { - int32_t x; - v16u8 src0, src1, src2, src3, reg0, reg1, reg2, reg3, dst0, dst1; - v8u16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; - v8u16 vec8, vec9, vec10, vec11; - v8u16 const_112 = (v8u16)__msa_ldi_h(112); - v8u16 const_74 = (v8u16)__msa_ldi_h(74); - v8u16 const_38 = (v8u16)__msa_ldi_h(38); - v8u16 const_94 = (v8u16)__msa_ldi_h(94); - v8u16 const_18 = (v8u16)__msa_ldi_h(18); - v8u16 const_32896 = (v8u16)__msa_fill_h(32896); - v16i8 zero = {0}; - - for (x = width; x > 0; x -= 16) { - src0 = (v16u8)__msa_ld_b((void*)src_argb, 0); - src1 = (v16u8)__msa_ld_b((void*)src_argb, 16); - src2 = (v16u8)__msa_ld_b((void*)src_argb, 32); - src3 = (v16u8)__msa_ld_b((void*)src_argb, 48); - reg0 = (v16u8)__msa_pckev_b((v16i8)src1, (v16i8)src0); - reg1 = (v16u8)__msa_pckev_b((v16i8)src3, (v16i8)src2); - reg2 = (v16u8)__msa_pckod_b((v16i8)src1, (v16i8)src0); - reg3 = (v16u8)__msa_pckod_b((v16i8)src3, (v16i8)src2); - src0 = 
(v16u8)__msa_pckev_b((v16i8)reg1, (v16i8)reg0); - src1 = (v16u8)__msa_pckev_b((v16i8)reg3, (v16i8)reg2); - src2 = (v16u8)__msa_pckod_b((v16i8)reg1, (v16i8)reg0); - vec0 = (v8u16)__msa_ilvr_b(zero, (v16i8)src0); - vec1 = (v8u16)__msa_ilvl_b(zero, (v16i8)src0); - vec2 = (v8u16)__msa_ilvr_b(zero, (v16i8)src1); - vec3 = (v8u16)__msa_ilvl_b(zero, (v16i8)src1); - vec4 = (v8u16)__msa_ilvr_b(zero, (v16i8)src2); - vec5 = (v8u16)__msa_ilvl_b(zero, (v16i8)src2); - vec10 = vec0 * const_18; - vec11 = vec1 * const_18; - vec8 = vec2 * const_94; - vec9 = vec3 * const_94; - vec6 = vec4 * const_112; - vec7 = vec5 * const_112; - vec0 *= const_112; - vec1 *= const_112; - vec2 *= const_74; - vec3 *= const_74; - vec4 *= const_38; - vec5 *= const_38; - vec8 += vec10; - vec9 += vec11; - vec6 += const_32896; - vec7 += const_32896; - vec0 += const_32896; - vec1 += const_32896; - vec2 += vec4; - vec3 += vec5; - vec0 -= vec2; - vec1 -= vec3; - vec6 -= vec8; - vec7 -= vec9; - vec0 = (v8u16)__msa_srai_h((v8i16)vec0, 8); - vec1 = (v8u16)__msa_srai_h((v8i16)vec1, 8); - vec6 = (v8u16)__msa_srai_h((v8i16)vec6, 8); - vec7 = (v8u16)__msa_srai_h((v8i16)vec7, 8); - dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0); - dst1 = (v16u8)__msa_pckev_b((v16i8)vec7, (v16i8)vec6); - ST_UB(dst0, dst_u); - ST_UB(dst1, dst_v); - src_argb += 64; - dst_u += 16; - dst_v += 16; - } -} - -void ARGBMultiplyRow_MSA(const uint8_t* src_argb, - const uint8_t* src_argb1, - uint8_t* dst_argb, - int width) { - int x; - v16u8 src0, src1, dst0; - v8u16 vec0, vec1, vec2, vec3; - v4u32 reg0, reg1, reg2, reg3; - v8i16 zero = {0}; - - for (x = 0; x < width; x += 4) { - src0 = (v16u8)__msa_ld_b((void*)src_argb, 0); - src1 = (v16u8)__msa_ld_b((void*)src_argb1, 0); - vec0 = (v8u16)__msa_ilvr_b((v16i8)src0, (v16i8)src0); - vec1 = (v8u16)__msa_ilvl_b((v16i8)src0, (v16i8)src0); - vec2 = (v8u16)__msa_ilvr_b((v16i8)zero, (v16i8)src1); - vec3 = (v8u16)__msa_ilvl_b((v16i8)zero, (v16i8)src1); - reg0 = (v4u32)__msa_ilvr_h(zero, (v8i16)vec0); - reg1 = (v4u32)__msa_ilvl_h(zero, (v8i16)vec0); - reg2 = (v4u32)__msa_ilvr_h(zero, (v8i16)vec1); - reg3 = (v4u32)__msa_ilvl_h(zero, (v8i16)vec1); - reg0 *= (v4u32)__msa_ilvr_h(zero, (v8i16)vec2); - reg1 *= (v4u32)__msa_ilvl_h(zero, (v8i16)vec2); - reg2 *= (v4u32)__msa_ilvr_h(zero, (v8i16)vec3); - reg3 *= (v4u32)__msa_ilvl_h(zero, (v8i16)vec3); - reg0 = (v4u32)__msa_srai_w((v4i32)reg0, 16); - reg1 = (v4u32)__msa_srai_w((v4i32)reg1, 16); - reg2 = (v4u32)__msa_srai_w((v4i32)reg2, 16); - reg3 = (v4u32)__msa_srai_w((v4i32)reg3, 16); - vec0 = (v8u16)__msa_pckev_h((v8i16)reg1, (v8i16)reg0); - vec1 = (v8u16)__msa_pckev_h((v8i16)reg3, (v8i16)reg2); - dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0); - ST_UB(dst0, dst_argb); - src_argb += 16; - src_argb1 += 16; - dst_argb += 16; - } -} - -void ARGBAddRow_MSA(const uint8_t* src_argb, - const uint8_t* src_argb1, - uint8_t* dst_argb, - int width) { - int x; - v16u8 src0, src1, src2, src3, dst0, dst1; - - for (x = 0; x < width; x += 8) { - src0 = (v16u8)__msa_ld_b((void*)src_argb, 0); - src1 = (v16u8)__msa_ld_b((void*)src_argb, 16); - src2 = (v16u8)__msa_ld_b((void*)src_argb1, 0); - src3 = (v16u8)__msa_ld_b((void*)src_argb1, 16); - dst0 = __msa_adds_u_b(src0, src2); - dst1 = __msa_adds_u_b(src1, src3); - ST_UB2(dst0, dst1, dst_argb, 16); - src_argb += 32; - src_argb1 += 32; - dst_argb += 32; - } -} - -void ARGBSubtractRow_MSA(const uint8_t* src_argb, - const uint8_t* src_argb1, - uint8_t* dst_argb, - int width) { - int x; - v16u8 src0, src1, src2, src3, dst0, dst1; - - for (x = 0; x < 
width; x += 8) { - src0 = (v16u8)__msa_ld_b((void*)src_argb, 0); - src1 = (v16u8)__msa_ld_b((void*)src_argb, 16); - src2 = (v16u8)__msa_ld_b((void*)src_argb1, 0); - src3 = (v16u8)__msa_ld_b((void*)src_argb1, 16); - dst0 = __msa_subs_u_b(src0, src2); - dst1 = __msa_subs_u_b(src1, src3); - ST_UB2(dst0, dst1, dst_argb, 16); - src_argb += 32; - src_argb1 += 32; - dst_argb += 32; - } -} - -void ARGBAttenuateRow_MSA(const uint8_t* src_argb, - uint8_t* dst_argb, - int width) { - int x; - v16u8 src0, src1, dst0, dst1; - v8u16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9; - v4u32 reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7; - v8i16 zero = {0}; - v16u8 mask = {0, 0, 0, 255, 0, 0, 0, 255, 0, 0, 0, 255, 0, 0, 0, 255}; - - for (x = 0; x < width; x += 8) { - src0 = (v16u8)__msa_ld_b((void*)src_argb, 0); - src1 = (v16u8)__msa_ld_b((void*)src_argb, 16); - vec0 = (v8u16)__msa_ilvr_b((v16i8)src0, (v16i8)src0); - vec1 = (v8u16)__msa_ilvl_b((v16i8)src0, (v16i8)src0); - vec2 = (v8u16)__msa_ilvr_b((v16i8)src1, (v16i8)src1); - vec3 = (v8u16)__msa_ilvl_b((v16i8)src1, (v16i8)src1); - vec4 = (v8u16)__msa_fill_h(vec0[3]); - vec5 = (v8u16)__msa_fill_h(vec0[7]); - vec6 = (v8u16)__msa_fill_h(vec1[3]); - vec7 = (v8u16)__msa_fill_h(vec1[7]); - vec4 = (v8u16)__msa_pckev_d((v2i64)vec5, (v2i64)vec4); - vec5 = (v8u16)__msa_pckev_d((v2i64)vec7, (v2i64)vec6); - vec6 = (v8u16)__msa_fill_h(vec2[3]); - vec7 = (v8u16)__msa_fill_h(vec2[7]); - vec8 = (v8u16)__msa_fill_h(vec3[3]); - vec9 = (v8u16)__msa_fill_h(vec3[7]); - vec6 = (v8u16)__msa_pckev_d((v2i64)vec7, (v2i64)vec6); - vec7 = (v8u16)__msa_pckev_d((v2i64)vec9, (v2i64)vec8); - reg0 = (v4u32)__msa_ilvr_h(zero, (v8i16)vec4); - reg1 = (v4u32)__msa_ilvl_h(zero, (v8i16)vec4); - reg2 = (v4u32)__msa_ilvr_h(zero, (v8i16)vec5); - reg3 = (v4u32)__msa_ilvl_h(zero, (v8i16)vec5); - reg4 = (v4u32)__msa_ilvr_h(zero, (v8i16)vec6); - reg5 = (v4u32)__msa_ilvl_h(zero, (v8i16)vec6); - reg6 = (v4u32)__msa_ilvr_h(zero, (v8i16)vec7); - reg7 = (v4u32)__msa_ilvl_h(zero, (v8i16)vec7); - reg0 *= (v4u32)__msa_ilvr_h(zero, (v8i16)vec0); - reg1 *= (v4u32)__msa_ilvl_h(zero, (v8i16)vec0); - reg2 *= (v4u32)__msa_ilvr_h(zero, (v8i16)vec1); - reg3 *= (v4u32)__msa_ilvl_h(zero, (v8i16)vec1); - reg4 *= (v4u32)__msa_ilvr_h(zero, (v8i16)vec2); - reg5 *= (v4u32)__msa_ilvl_h(zero, (v8i16)vec2); - reg6 *= (v4u32)__msa_ilvr_h(zero, (v8i16)vec3); - reg7 *= (v4u32)__msa_ilvl_h(zero, (v8i16)vec3); - reg0 = (v4u32)__msa_srai_w((v4i32)reg0, 24); - reg1 = (v4u32)__msa_srai_w((v4i32)reg1, 24); - reg2 = (v4u32)__msa_srai_w((v4i32)reg2, 24); - reg3 = (v4u32)__msa_srai_w((v4i32)reg3, 24); - reg4 = (v4u32)__msa_srai_w((v4i32)reg4, 24); - reg5 = (v4u32)__msa_srai_w((v4i32)reg5, 24); - reg6 = (v4u32)__msa_srai_w((v4i32)reg6, 24); - reg7 = (v4u32)__msa_srai_w((v4i32)reg7, 24); - vec0 = (v8u16)__msa_pckev_h((v8i16)reg1, (v8i16)reg0); - vec1 = (v8u16)__msa_pckev_h((v8i16)reg3, (v8i16)reg2); - vec2 = (v8u16)__msa_pckev_h((v8i16)reg5, (v8i16)reg4); - vec3 = (v8u16)__msa_pckev_h((v8i16)reg7, (v8i16)reg6); - dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0); - dst1 = (v16u8)__msa_pckev_b((v16i8)vec3, (v16i8)vec2); - dst0 = __msa_bmnz_v(dst0, src0, mask); - dst1 = __msa_bmnz_v(dst1, src1, mask); - ST_UB2(dst0, dst1, dst_argb, 16); - src_argb += 32; - dst_argb += 32; - } -} - -void ARGBToRGB565DitherRow_MSA(const uint8_t* src_argb, - uint8_t* dst_rgb, - uint32_t dither4, - int width) { - int x; - v16u8 src0, src1, dst0, vec0, vec1; - v8i16 vec_d0; - v8i16 reg0, reg1, reg2; - v16i8 zero = {0}; - v8i16 max = 
__msa_ldi_h(0xFF); - - vec_d0 = (v8i16)__msa_fill_w(dither4); - vec_d0 = (v8i16)__msa_ilvr_b(zero, (v16i8)vec_d0); - - for (x = 0; x < width; x += 8) { - src0 = (v16u8)__msa_ld_b((void*)src_argb, 0); - src1 = (v16u8)__msa_ld_b((void*)src_argb, 16); - vec0 = (v16u8)__msa_pckev_b((v16i8)src1, (v16i8)src0); - vec1 = (v16u8)__msa_pckod_b((v16i8)src1, (v16i8)src0); - reg0 = (v8i16)__msa_ilvev_b(zero, (v16i8)vec0); - reg1 = (v8i16)__msa_ilvev_b(zero, (v16i8)vec1); - reg2 = (v8i16)__msa_ilvod_b(zero, (v16i8)vec0); - reg0 += vec_d0; - reg1 += vec_d0; - reg2 += vec_d0; - reg0 = __msa_maxi_s_h((v8i16)reg0, 0); - reg1 = __msa_maxi_s_h((v8i16)reg1, 0); - reg2 = __msa_maxi_s_h((v8i16)reg2, 0); - reg0 = __msa_min_s_h((v8i16)max, (v8i16)reg0); - reg1 = __msa_min_s_h((v8i16)max, (v8i16)reg1); - reg2 = __msa_min_s_h((v8i16)max, (v8i16)reg2); - reg0 = __msa_srai_h(reg0, 3); - reg2 = __msa_srai_h(reg2, 3); - reg1 = __msa_srai_h(reg1, 2); - reg2 = __msa_slli_h(reg2, 11); - reg1 = __msa_slli_h(reg1, 5); - reg0 |= reg1; - dst0 = (v16u8)(reg0 | reg2); - ST_UB(dst0, dst_rgb); - src_argb += 32; - dst_rgb += 16; - } -} - -void ARGBShuffleRow_MSA(const uint8_t* src_argb, - uint8_t* dst_argb, - const uint8_t* shuffler, - int width) { - int x; - v16u8 src0, src1, dst0, dst1; - v16i8 vec0; - v16i8 shuffler_vec = {0, 0, 0, 0, 4, 4, 4, 4, 8, 8, 8, 8, 12, 12, 12, 12}; - int32_t val = LW((int32_t*)shuffler); - - vec0 = (v16i8)__msa_fill_w(val); - shuffler_vec += vec0; - - for (x = 0; x < width; x += 8) { - src0 = (v16u8)__msa_ld_b((v16u8*)src_argb, 0); - src1 = (v16u8)__msa_ld_b((v16u8*)src_argb, 16); - dst0 = (v16u8)__msa_vshf_b(shuffler_vec, (v16i8)src0, (v16i8)src0); - dst1 = (v16u8)__msa_vshf_b(shuffler_vec, (v16i8)src1, (v16i8)src1); - ST_UB2(dst0, dst1, dst_argb, 16); - src_argb += 32; - dst_argb += 32; - } -} - -void ARGBShadeRow_MSA(const uint8_t* src_argb, - uint8_t* dst_argb, - int width, - uint32_t value) { - int x; - v16u8 src0, dst0; - v8u16 vec0, vec1; - v4u32 reg0, reg1, reg2, reg3, rgba_scale; - v8i16 zero = {0}; - - rgba_scale[0] = value; - rgba_scale = (v4u32)__msa_ilvr_b((v16i8)rgba_scale, (v16i8)rgba_scale); - rgba_scale = (v4u32)__msa_ilvr_h(zero, (v8i16)rgba_scale); - - for (x = 0; x < width; x += 4) { - src0 = (v16u8)__msa_ld_b((v16u8*)src_argb, 0); - vec0 = (v8u16)__msa_ilvr_b((v16i8)src0, (v16i8)src0); - vec1 = (v8u16)__msa_ilvl_b((v16i8)src0, (v16i8)src0); - reg0 = (v4u32)__msa_ilvr_h(zero, (v8i16)vec0); - reg1 = (v4u32)__msa_ilvl_h(zero, (v8i16)vec0); - reg2 = (v4u32)__msa_ilvr_h(zero, (v8i16)vec1); - reg3 = (v4u32)__msa_ilvl_h(zero, (v8i16)vec1); - reg0 *= rgba_scale; - reg1 *= rgba_scale; - reg2 *= rgba_scale; - reg3 *= rgba_scale; - reg0 = (v4u32)__msa_srai_w((v4i32)reg0, 24); - reg1 = (v4u32)__msa_srai_w((v4i32)reg1, 24); - reg2 = (v4u32)__msa_srai_w((v4i32)reg2, 24); - reg3 = (v4u32)__msa_srai_w((v4i32)reg3, 24); - vec0 = (v8u16)__msa_pckev_h((v8i16)reg1, (v8i16)reg0); - vec1 = (v8u16)__msa_pckev_h((v8i16)reg3, (v8i16)reg2); - dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0); - ST_UB(dst0, dst_argb); - src_argb += 16; - dst_argb += 16; - } -} - -void ARGBGrayRow_MSA(const uint8_t* src_argb, uint8_t* dst_argb, int width) { - int x; - v16u8 src0, src1, vec0, vec1, dst0, dst1; - v8u16 reg0; - v16u8 const_0x4D = (v16u8)__msa_ldi_h(0x4D); - v16u8 const_0x961D = (v16u8)__msa_fill_h(0x961D); - - for (x = 0; x < width; x += 8) { - src0 = (v16u8)__msa_ld_b((v16u8*)src_argb, 0); - src1 = (v16u8)__msa_ld_b((v16u8*)src_argb, 16); - vec0 = (v16u8)__msa_pckev_h((v8i16)src1, (v8i16)src0); - vec1 = 
(v16u8)__msa_pckod_h((v8i16)src1, (v8i16)src0); - reg0 = __msa_dotp_u_h(vec0, const_0x961D); - reg0 = __msa_dpadd_u_h(reg0, vec1, const_0x4D); - reg0 = (v8u16)__msa_srari_h((v8i16)reg0, 8); - vec0 = (v16u8)__msa_ilvev_b((v16i8)reg0, (v16i8)reg0); - vec1 = (v16u8)__msa_ilvod_b((v16i8)vec1, (v16i8)vec0); - dst0 = (v16u8)__msa_ilvr_b((v16i8)vec1, (v16i8)vec0); - dst1 = (v16u8)__msa_ilvl_b((v16i8)vec1, (v16i8)vec0); - ST_UB2(dst0, dst1, dst_argb, 16); - src_argb += 32; - dst_argb += 32; - } -} - -void ARGBSepiaRow_MSA(uint8_t* dst_argb, int width) { - int x; - v16u8 src0, src1, dst0, dst1, vec0, vec1, vec2, vec3, vec4, vec5; - v8u16 reg0, reg1, reg2; - v16u8 const_0x4411 = (v16u8)__msa_fill_h(0x4411); - v16u8 const_0x23 = (v16u8)__msa_ldi_h(0x23); - v16u8 const_0x5816 = (v16u8)__msa_fill_h(0x5816); - v16u8 const_0x2D = (v16u8)__msa_ldi_h(0x2D); - v16u8 const_0x6218 = (v16u8)__msa_fill_h(0x6218); - v16u8 const_0x32 = (v16u8)__msa_ldi_h(0x32); - v8u16 const_0xFF = (v8u16)__msa_ldi_h(0xFF); - - for (x = 0; x < width; x += 8) { - src0 = (v16u8)__msa_ld_b((v16u8*)dst_argb, 0); - src1 = (v16u8)__msa_ld_b((v16u8*)dst_argb, 16); - vec0 = (v16u8)__msa_pckev_h((v8i16)src1, (v8i16)src0); - vec1 = (v16u8)__msa_pckod_h((v8i16)src1, (v8i16)src0); - vec3 = (v16u8)__msa_pckod_b((v16i8)vec1, (v16i8)vec1); - reg0 = (v8u16)__msa_dotp_u_h(vec0, const_0x4411); - reg1 = (v8u16)__msa_dotp_u_h(vec0, const_0x5816); - reg2 = (v8u16)__msa_dotp_u_h(vec0, const_0x6218); - reg0 = (v8u16)__msa_dpadd_u_h(reg0, vec1, const_0x23); - reg1 = (v8u16)__msa_dpadd_u_h(reg1, vec1, const_0x2D); - reg2 = (v8u16)__msa_dpadd_u_h(reg2, vec1, const_0x32); - reg0 = (v8u16)__msa_srai_h((v8i16)reg0, 7); - reg1 = (v8u16)__msa_srai_h((v8i16)reg1, 7); - reg2 = (v8u16)__msa_srai_h((v8i16)reg2, 7); - reg1 = (v8u16)__msa_min_u_h((v8u16)reg1, const_0xFF); - reg2 = (v8u16)__msa_min_u_h((v8u16)reg2, const_0xFF); - vec0 = (v16u8)__msa_pckev_b((v16i8)reg0, (v16i8)reg0); - vec1 = (v16u8)__msa_pckev_b((v16i8)reg1, (v16i8)reg1); - vec2 = (v16u8)__msa_pckev_b((v16i8)reg2, (v16i8)reg2); - vec4 = (v16u8)__msa_ilvr_b((v16i8)vec2, (v16i8)vec0); - vec5 = (v16u8)__msa_ilvr_b((v16i8)vec3, (v16i8)vec1); - dst0 = (v16u8)__msa_ilvr_b((v16i8)vec5, (v16i8)vec4); - dst1 = (v16u8)__msa_ilvl_b((v16i8)vec5, (v16i8)vec4); - ST_UB2(dst0, dst1, dst_argb, 16); - dst_argb += 32; - } -} - -void ARGB4444ToARGBRow_MSA(const uint8_t* src_argb4444, - uint8_t* dst_argb, - int width) { - int x; - v16u8 src0, src1; - v8u16 vec0, vec1, vec2, vec3; - v16u8 dst0, dst1, dst2, dst3; - - for (x = 0; x < width; x += 16) { - src0 = (v16u8)__msa_ld_b((v16u8*)src_argb4444, 0); - src1 = (v16u8)__msa_ld_b((v16u8*)src_argb4444, 16); - vec0 = (v8u16)__msa_andi_b(src0, 0x0F); - vec1 = (v8u16)__msa_andi_b(src1, 0x0F); - vec2 = (v8u16)__msa_andi_b(src0, 0xF0); - vec3 = (v8u16)__msa_andi_b(src1, 0xF0); - vec0 |= (v8u16)__msa_slli_b((v16i8)vec0, 4); - vec1 |= (v8u16)__msa_slli_b((v16i8)vec1, 4); - vec2 |= (v8u16)__msa_srli_b((v16i8)vec2, 4); - vec3 |= (v8u16)__msa_srli_b((v16i8)vec3, 4); - dst0 = (v16u8)__msa_ilvr_b((v16i8)vec2, (v16i8)vec0); - dst1 = (v16u8)__msa_ilvl_b((v16i8)vec2, (v16i8)vec0); - dst2 = (v16u8)__msa_ilvr_b((v16i8)vec3, (v16i8)vec1); - dst3 = (v16u8)__msa_ilvl_b((v16i8)vec3, (v16i8)vec1); - ST_UB4(dst0, dst1, dst2, dst3, dst_argb, 16); - src_argb4444 += 32; - dst_argb += 64; - } -} - -void ARGB1555ToARGBRow_MSA(const uint8_t* src_argb1555, - uint8_t* dst_argb, - int width) { - int x; - v8u16 src0, src1; - v8u16 vec0, vec1, vec2, vec3, vec4, vec5; - v16u8 reg0, reg1, reg2, reg3, reg4, 
reg5, reg6; - v16u8 dst0, dst1, dst2, dst3; - v8u16 const_0x1F = (v8u16)__msa_ldi_h(0x1F); - - for (x = 0; x < width; x += 16) { - src0 = (v8u16)__msa_ld_h((void*)src_argb1555, 0); - src1 = (v8u16)__msa_ld_h((void*)src_argb1555, 16); - vec0 = src0 & const_0x1F; - vec1 = src1 & const_0x1F; - src0 = (v8u16)__msa_srli_h((v8i16)src0, 5); - src1 = (v8u16)__msa_srli_h((v8i16)src1, 5); - vec2 = src0 & const_0x1F; - vec3 = src1 & const_0x1F; - src0 = (v8u16)__msa_srli_h((v8i16)src0, 5); - src1 = (v8u16)__msa_srli_h((v8i16)src1, 5); - vec4 = src0 & const_0x1F; - vec5 = src1 & const_0x1F; - src0 = (v8u16)__msa_srli_h((v8i16)src0, 5); - src1 = (v8u16)__msa_srli_h((v8i16)src1, 5); - reg0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0); - reg1 = (v16u8)__msa_pckev_b((v16i8)vec3, (v16i8)vec2); - reg2 = (v16u8)__msa_pckev_b((v16i8)vec5, (v16i8)vec4); - reg3 = (v16u8)__msa_pckev_b((v16i8)src1, (v16i8)src0); - reg4 = (v16u8)__msa_slli_b((v16i8)reg0, 3); - reg5 = (v16u8)__msa_slli_b((v16i8)reg1, 3); - reg6 = (v16u8)__msa_slli_b((v16i8)reg2, 3); - reg4 |= (v16u8)__msa_srai_b((v16i8)reg0, 2); - reg5 |= (v16u8)__msa_srai_b((v16i8)reg1, 2); - reg6 |= (v16u8)__msa_srai_b((v16i8)reg2, 2); - reg3 = -reg3; - reg0 = (v16u8)__msa_ilvr_b((v16i8)reg6, (v16i8)reg4); - reg1 = (v16u8)__msa_ilvl_b((v16i8)reg6, (v16i8)reg4); - reg2 = (v16u8)__msa_ilvr_b((v16i8)reg3, (v16i8)reg5); - reg3 = (v16u8)__msa_ilvl_b((v16i8)reg3, (v16i8)reg5); - dst0 = (v16u8)__msa_ilvr_b((v16i8)reg2, (v16i8)reg0); - dst1 = (v16u8)__msa_ilvl_b((v16i8)reg2, (v16i8)reg0); - dst2 = (v16u8)__msa_ilvr_b((v16i8)reg3, (v16i8)reg1); - dst3 = (v16u8)__msa_ilvl_b((v16i8)reg3, (v16i8)reg1); - ST_UB4(dst0, dst1, dst2, dst3, dst_argb, 16); - src_argb1555 += 32; - dst_argb += 64; - } -} - -void RGB565ToARGBRow_MSA(const uint8_t* src_rgb565, - uint8_t* dst_argb, - int width) { - int x; - v8u16 src0, src1, vec0, vec1, vec2, vec3, vec4, vec5; - v8u16 reg0, reg1, reg2, reg3, reg4, reg5; - v16u8 res0, res1, res2, res3, dst0, dst1, dst2, dst3; - v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL); - v8u16 const_0x1F = (v8u16)__msa_ldi_h(0x1F); - v8u16 const_0x7E0 = (v8u16)__msa_fill_h(0x7E0); - v8u16 const_0xF800 = (v8u16)__msa_fill_h(0xF800); - - for (x = 0; x < width; x += 16) { - src0 = (v8u16)__msa_ld_h((void*)src_rgb565, 0); - src1 = (v8u16)__msa_ld_h((void*)src_rgb565, 16); - vec0 = src0 & const_0x1F; - vec1 = src0 & const_0x7E0; - vec2 = src0 & const_0xF800; - vec3 = src1 & const_0x1F; - vec4 = src1 & const_0x7E0; - vec5 = src1 & const_0xF800; - reg0 = (v8u16)__msa_slli_h((v8i16)vec0, 3); - reg1 = (v8u16)__msa_srli_h((v8i16)vec1, 3); - reg2 = (v8u16)__msa_srli_h((v8i16)vec2, 8); - reg3 = (v8u16)__msa_slli_h((v8i16)vec3, 3); - reg4 = (v8u16)__msa_srli_h((v8i16)vec4, 3); - reg5 = (v8u16)__msa_srli_h((v8i16)vec5, 8); - reg0 |= (v8u16)__msa_srli_h((v8i16)vec0, 2); - reg1 |= (v8u16)__msa_srli_h((v8i16)vec1, 9); - reg2 |= (v8u16)__msa_srli_h((v8i16)vec2, 13); - reg3 |= (v8u16)__msa_srli_h((v8i16)vec3, 2); - reg4 |= (v8u16)__msa_srli_h((v8i16)vec4, 9); - reg5 |= (v8u16)__msa_srli_h((v8i16)vec5, 13); - res0 = (v16u8)__msa_ilvev_b((v16i8)reg2, (v16i8)reg0); - res1 = (v16u8)__msa_ilvev_b((v16i8)alpha, (v16i8)reg1); - res2 = (v16u8)__msa_ilvev_b((v16i8)reg5, (v16i8)reg3); - res3 = (v16u8)__msa_ilvev_b((v16i8)alpha, (v16i8)reg4); - dst0 = (v16u8)__msa_ilvr_b((v16i8)res1, (v16i8)res0); - dst1 = (v16u8)__msa_ilvl_b((v16i8)res1, (v16i8)res0); - dst2 = (v16u8)__msa_ilvr_b((v16i8)res3, (v16i8)res2); - dst3 = (v16u8)__msa_ilvl_b((v16i8)res3, (v16i8)res2); - ST_UB4(dst0, dst1, dst2, 
dst3, dst_argb, 16); - src_rgb565 += 32; - dst_argb += 64; - } -} - -void RGB24ToARGBRow_MSA(const uint8_t* src_rgb24, - uint8_t* dst_argb, - int width) { - int x; - v16u8 src0, src1, src2; - v16u8 vec0, vec1, vec2; - v16u8 dst0, dst1, dst2, dst3; - v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL); - v16i8 shuffler = {0, 1, 2, 16, 3, 4, 5, 17, 6, 7, 8, 18, 9, 10, 11, 19}; - - for (x = 0; x < width; x += 16) { - src0 = (v16u8)__msa_ld_b((void*)src_rgb24, 0); - src1 = (v16u8)__msa_ld_b((void*)src_rgb24, 16); - src2 = (v16u8)__msa_ld_b((void*)src_rgb24, 32); - vec0 = (v16u8)__msa_sldi_b((v16i8)src1, (v16i8)src0, 12); - vec1 = (v16u8)__msa_sldi_b((v16i8)src2, (v16i8)src1, 8); - vec2 = (v16u8)__msa_sldi_b((v16i8)src2, (v16i8)src2, 4); - dst0 = (v16u8)__msa_vshf_b(shuffler, (v16i8)alpha, (v16i8)src0); - dst1 = (v16u8)__msa_vshf_b(shuffler, (v16i8)alpha, (v16i8)vec0); - dst2 = (v16u8)__msa_vshf_b(shuffler, (v16i8)alpha, (v16i8)vec1); - dst3 = (v16u8)__msa_vshf_b(shuffler, (v16i8)alpha, (v16i8)vec2); - ST_UB4(dst0, dst1, dst2, dst3, dst_argb, 16); - src_rgb24 += 48; - dst_argb += 64; - } -} - -void RAWToARGBRow_MSA(const uint8_t* src_raw, uint8_t* dst_argb, int width) { - int x; - v16u8 src0, src1, src2; - v16u8 vec0, vec1, vec2; - v16u8 dst0, dst1, dst2, dst3; - v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL); - v16i8 mask = {2, 1, 0, 16, 5, 4, 3, 17, 8, 7, 6, 18, 11, 10, 9, 19}; - - for (x = 0; x < width; x += 16) { - src0 = (v16u8)__msa_ld_b((void*)src_raw, 0); - src1 = (v16u8)__msa_ld_b((void*)src_raw, 16); - src2 = (v16u8)__msa_ld_b((void*)src_raw, 32); - vec0 = (v16u8)__msa_sldi_b((v16i8)src1, (v16i8)src0, 12); - vec1 = (v16u8)__msa_sldi_b((v16i8)src2, (v16i8)src1, 8); - vec2 = (v16u8)__msa_sldi_b((v16i8)src2, (v16i8)src2, 4); - dst0 = (v16u8)__msa_vshf_b(mask, (v16i8)alpha, (v16i8)src0); - dst1 = (v16u8)__msa_vshf_b(mask, (v16i8)alpha, (v16i8)vec0); - dst2 = (v16u8)__msa_vshf_b(mask, (v16i8)alpha, (v16i8)vec1); - dst3 = (v16u8)__msa_vshf_b(mask, (v16i8)alpha, (v16i8)vec2); - ST_UB4(dst0, dst1, dst2, dst3, dst_argb, 16); - src_raw += 48; - dst_argb += 64; - } -} - -void ARGB1555ToYRow_MSA(const uint8_t* src_argb1555, - uint8_t* dst_y, - int width) { - int x; - v8u16 src0, src1, vec0, vec1, vec2, vec3, vec4, vec5; - v8u16 reg0, reg1, reg2, reg3, reg4, reg5; - v16u8 dst0; - v8u16 const_0x19 = (v8u16)__msa_ldi_h(0x19); - v8u16 const_0x81 = (v8u16)__msa_ldi_h(0x81); - v8u16 const_0x42 = (v8u16)__msa_ldi_h(0x42); - v8u16 const_0x1F = (v8u16)__msa_ldi_h(0x1F); - v8u16 const_0x1080 = (v8u16)__msa_fill_h(0x1080); - - for (x = 0; x < width; x += 16) { - src0 = (v8u16)__msa_ld_b((void*)src_argb1555, 0); - src1 = (v8u16)__msa_ld_b((void*)src_argb1555, 16); - vec0 = src0 & const_0x1F; - vec1 = src1 & const_0x1F; - src0 = (v8u16)__msa_srai_h((v8i16)src0, 5); - src1 = (v8u16)__msa_srai_h((v8i16)src1, 5); - vec2 = src0 & const_0x1F; - vec3 = src1 & const_0x1F; - src0 = (v8u16)__msa_srai_h((v8i16)src0, 5); - src1 = (v8u16)__msa_srai_h((v8i16)src1, 5); - vec4 = src0 & const_0x1F; - vec5 = src1 & const_0x1F; - reg0 = (v8u16)__msa_slli_h((v8i16)vec0, 3); - reg1 = (v8u16)__msa_slli_h((v8i16)vec1, 3); - reg0 |= (v8u16)__msa_srai_h((v8i16)vec0, 2); - reg1 |= (v8u16)__msa_srai_h((v8i16)vec1, 2); - reg2 = (v8u16)__msa_slli_h((v8i16)vec2, 3); - reg3 = (v8u16)__msa_slli_h((v8i16)vec3, 3); - reg2 |= (v8u16)__msa_srai_h((v8i16)vec2, 2); - reg3 |= (v8u16)__msa_srai_h((v8i16)vec3, 2); - reg4 = (v8u16)__msa_slli_h((v8i16)vec4, 3); - reg5 = (v8u16)__msa_slli_h((v8i16)vec5, 3); - reg4 |= (v8u16)__msa_srai_h((v8i16)vec4, 2); - reg5 
|= (v8u16)__msa_srai_h((v8i16)vec5, 2); - reg0 *= const_0x19; - reg1 *= const_0x19; - reg2 *= const_0x81; - reg3 *= const_0x81; - reg4 *= const_0x42; - reg5 *= const_0x42; - reg0 += reg2; - reg1 += reg3; - reg0 += reg4; - reg1 += reg5; - reg0 += const_0x1080; - reg1 += const_0x1080; - reg0 = (v8u16)__msa_srai_h((v8i16)reg0, 8); - reg1 = (v8u16)__msa_srai_h((v8i16)reg1, 8); - dst0 = (v16u8)__msa_pckev_b((v16i8)reg1, (v16i8)reg0); - ST_UB(dst0, dst_y); - src_argb1555 += 32; - dst_y += 16; - } -} - -void RGB565ToYRow_MSA(const uint8_t* src_rgb565, uint8_t* dst_y, int width) { - int x; - v8u16 src0, src1, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; - v8u16 reg0, reg1, reg2, reg3, reg4, reg5; - v4u32 res0, res1, res2, res3; - v16u8 dst0; - v4u32 const_0x810019 = (v4u32)__msa_fill_w(0x810019); - v4u32 const_0x010042 = (v4u32)__msa_fill_w(0x010042); - v8i16 const_0x1080 = __msa_fill_h(0x1080); - v8u16 const_0x1F = (v8u16)__msa_ldi_h(0x1F); - v8u16 const_0x7E0 = (v8u16)__msa_fill_h(0x7E0); - v8u16 const_0xF800 = (v8u16)__msa_fill_h(0xF800); - - for (x = 0; x < width; x += 16) { - src0 = (v8u16)__msa_ld_b((void*)src_rgb565, 0); - src1 = (v8u16)__msa_ld_b((void*)src_rgb565, 16); - vec0 = src0 & const_0x1F; - vec1 = src0 & const_0x7E0; - vec2 = src0 & const_0xF800; - vec3 = src1 & const_0x1F; - vec4 = src1 & const_0x7E0; - vec5 = src1 & const_0xF800; - reg0 = (v8u16)__msa_slli_h((v8i16)vec0, 3); - reg1 = (v8u16)__msa_srli_h((v8i16)vec1, 3); - reg2 = (v8u16)__msa_srli_h((v8i16)vec2, 8); - reg3 = (v8u16)__msa_slli_h((v8i16)vec3, 3); - reg4 = (v8u16)__msa_srli_h((v8i16)vec4, 3); - reg5 = (v8u16)__msa_srli_h((v8i16)vec5, 8); - reg0 |= (v8u16)__msa_srli_h((v8i16)vec0, 2); - reg1 |= (v8u16)__msa_srli_h((v8i16)vec1, 9); - reg2 |= (v8u16)__msa_srli_h((v8i16)vec2, 13); - reg3 |= (v8u16)__msa_srli_h((v8i16)vec3, 2); - reg4 |= (v8u16)__msa_srli_h((v8i16)vec4, 9); - reg5 |= (v8u16)__msa_srli_h((v8i16)vec5, 13); - vec0 = (v8u16)__msa_ilvr_h((v8i16)reg1, (v8i16)reg0); - vec1 = (v8u16)__msa_ilvl_h((v8i16)reg1, (v8i16)reg0); - vec2 = (v8u16)__msa_ilvr_h((v8i16)reg4, (v8i16)reg3); - vec3 = (v8u16)__msa_ilvl_h((v8i16)reg4, (v8i16)reg3); - vec4 = (v8u16)__msa_ilvr_h(const_0x1080, (v8i16)reg2); - vec5 = (v8u16)__msa_ilvl_h(const_0x1080, (v8i16)reg2); - vec6 = (v8u16)__msa_ilvr_h(const_0x1080, (v8i16)reg5); - vec7 = (v8u16)__msa_ilvl_h(const_0x1080, (v8i16)reg5); - res0 = __msa_dotp_u_w(vec0, (v8u16)const_0x810019); - res1 = __msa_dotp_u_w(vec1, (v8u16)const_0x810019); - res2 = __msa_dotp_u_w(vec2, (v8u16)const_0x810019); - res3 = __msa_dotp_u_w(vec3, (v8u16)const_0x810019); - res0 = __msa_dpadd_u_w(res0, vec4, (v8u16)const_0x010042); - res1 = __msa_dpadd_u_w(res1, vec5, (v8u16)const_0x010042); - res2 = __msa_dpadd_u_w(res2, vec6, (v8u16)const_0x010042); - res3 = __msa_dpadd_u_w(res3, vec7, (v8u16)const_0x010042); - res0 = (v4u32)__msa_srai_w((v4i32)res0, 8); - res1 = (v4u32)__msa_srai_w((v4i32)res1, 8); - res2 = (v4u32)__msa_srai_w((v4i32)res2, 8); - res3 = (v4u32)__msa_srai_w((v4i32)res3, 8); - vec0 = (v8u16)__msa_pckev_h((v8i16)res1, (v8i16)res0); - vec1 = (v8u16)__msa_pckev_h((v8i16)res3, (v8i16)res2); - dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0); - ST_UB(dst0, dst_y); - src_rgb565 += 32; - dst_y += 16; - } -} - -void RGB24ToYRow_MSA(const uint8_t* src_argb, uint8_t* dst_y, int width) { - int x; - v16u8 src0, src1, src2, reg0, reg1, reg2, reg3, dst0; - v8u16 vec0, vec1, vec2, vec3; - v8u16 const_0x8119 = (v8u16)__msa_fill_h(0x8119); - v8u16 const_0x42 = (v8u16)__msa_fill_h(0x42); - v8u16 const_0x1080 = 
(v8u16)__msa_fill_h(0x1080); - v16i8 mask0 = {0, 1, 2, 3, 3, 4, 5, 6, 6, 7, 8, 9, 9, 10, 11, 12}; - v16i8 mask1 = {12, 13, 14, 15, 15, 16, 17, 18, - 18, 19, 20, 21, 21, 22, 23, 24}; - v16i8 mask2 = {8, 9, 10, 11, 11, 12, 13, 14, 14, 15, 16, 17, 17, 18, 19, 20}; - v16i8 mask3 = {4, 5, 6, 7, 7, 8, 9, 10, 10, 11, 12, 13, 13, 14, 15, 16}; - v16i8 zero = {0}; - - for (x = 0; x < width; x += 16) { - src0 = (v16u8)__msa_ld_b((void*)src_argb, 0); - src1 = (v16u8)__msa_ld_b((void*)src_argb, 16); - src2 = (v16u8)__msa_ld_b((void*)src_argb, 32); - reg0 = (v16u8)__msa_vshf_b(mask0, zero, (v16i8)src0); - reg1 = (v16u8)__msa_vshf_b(mask1, (v16i8)src1, (v16i8)src0); - reg2 = (v16u8)__msa_vshf_b(mask2, (v16i8)src2, (v16i8)src1); - reg3 = (v16u8)__msa_vshf_b(mask3, zero, (v16i8)src2); - vec0 = (v8u16)__msa_pckev_h((v8i16)reg1, (v8i16)reg0); - vec1 = (v8u16)__msa_pckev_h((v8i16)reg3, (v8i16)reg2); - vec2 = (v8u16)__msa_pckod_h((v8i16)reg1, (v8i16)reg0); - vec3 = (v8u16)__msa_pckod_h((v8i16)reg3, (v8i16)reg2); - vec0 = __msa_dotp_u_h((v16u8)vec0, (v16u8)const_0x8119); - vec1 = __msa_dotp_u_h((v16u8)vec1, (v16u8)const_0x8119); - vec0 = __msa_dpadd_u_h(vec0, (v16u8)vec2, (v16u8)const_0x42); - vec1 = __msa_dpadd_u_h(vec1, (v16u8)vec3, (v16u8)const_0x42); - vec0 += const_0x1080; - vec1 += const_0x1080; - vec0 = (v8u16)__msa_srai_h((v8i16)vec0, 8); - vec1 = (v8u16)__msa_srai_h((v8i16)vec1, 8); - dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0); - ST_UB(dst0, dst_y); - src_argb += 48; - dst_y += 16; - } -} - -void RAWToYRow_MSA(const uint8_t* src_argb, uint8_t* dst_y, int width) { - int x; - v16u8 src0, src1, src2, reg0, reg1, reg2, reg3, dst0; - v8u16 vec0, vec1, vec2, vec3; - v8u16 const_0x8142 = (v8u16)__msa_fill_h(0x8142); - v8u16 const_0x19 = (v8u16)__msa_fill_h(0x19); - v8u16 const_0x1080 = (v8u16)__msa_fill_h(0x1080); - v16i8 mask0 = {0, 1, 2, 3, 3, 4, 5, 6, 6, 7, 8, 9, 9, 10, 11, 12}; - v16i8 mask1 = {12, 13, 14, 15, 15, 16, 17, 18, - 18, 19, 20, 21, 21, 22, 23, 24}; - v16i8 mask2 = {8, 9, 10, 11, 11, 12, 13, 14, 14, 15, 16, 17, 17, 18, 19, 20}; - v16i8 mask3 = {4, 5, 6, 7, 7, 8, 9, 10, 10, 11, 12, 13, 13, 14, 15, 16}; - v16i8 zero = {0}; - - for (x = 0; x < width; x += 16) { - src0 = (v16u8)__msa_ld_b((void*)src_argb, 0); - src1 = (v16u8)__msa_ld_b((void*)src_argb, 16); - src2 = (v16u8)__msa_ld_b((void*)src_argb, 32); - reg0 = (v16u8)__msa_vshf_b(mask0, zero, (v16i8)src0); - reg1 = (v16u8)__msa_vshf_b(mask1, (v16i8)src1, (v16i8)src0); - reg2 = (v16u8)__msa_vshf_b(mask2, (v16i8)src2, (v16i8)src1); - reg3 = (v16u8)__msa_vshf_b(mask3, zero, (v16i8)src2); - vec0 = (v8u16)__msa_pckev_h((v8i16)reg1, (v8i16)reg0); - vec1 = (v8u16)__msa_pckev_h((v8i16)reg3, (v8i16)reg2); - vec2 = (v8u16)__msa_pckod_h((v8i16)reg1, (v8i16)reg0); - vec3 = (v8u16)__msa_pckod_h((v8i16)reg3, (v8i16)reg2); - vec0 = __msa_dotp_u_h((v16u8)vec0, (v16u8)const_0x8142); - vec1 = __msa_dotp_u_h((v16u8)vec1, (v16u8)const_0x8142); - vec0 = __msa_dpadd_u_h(vec0, (v16u8)vec2, (v16u8)const_0x19); - vec1 = __msa_dpadd_u_h(vec1, (v16u8)vec3, (v16u8)const_0x19); - vec0 += const_0x1080; - vec1 += const_0x1080; - vec0 = (v8u16)__msa_srai_h((v8i16)vec0, 8); - vec1 = (v8u16)__msa_srai_h((v8i16)vec1, 8); - dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0); - ST_UB(dst0, dst_y); - src_argb += 48; - dst_y += 16; - } -} - -void ARGB1555ToUVRow_MSA(const uint8_t* src_argb1555, - int src_stride_argb1555, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - int x; - const uint16_t* s = (const uint16_t*)src_argb1555; - const uint16_t* t = (const 
uint16_t*)(src_argb1555 + src_stride_argb1555); - int64_t res0, res1; - v8u16 src0, src1, src2, src3, reg0, reg1, reg2, reg3; - v8u16 vec0, vec1, vec2, vec3, vec4, vec5, vec6; - v16u8 dst0; - v8u16 const_0x70 = (v8u16)__msa_ldi_h(0x70); - v8u16 const_0x4A = (v8u16)__msa_ldi_h(0x4A); - v8u16 const_0x26 = (v8u16)__msa_ldi_h(0x26); - v8u16 const_0x5E = (v8u16)__msa_ldi_h(0x5E); - v8u16 const_0x12 = (v8u16)__msa_ldi_h(0x12); - v8u16 const_0x8080 = (v8u16)__msa_fill_h(0x8080); - v8u16 const_0x1F = (v8u16)__msa_ldi_h(0x1F); - - for (x = 0; x < width; x += 16) { - src0 = (v8u16)__msa_ld_b((void*)s, 0); - src1 = (v8u16)__msa_ld_b((void*)s, 16); - src2 = (v8u16)__msa_ld_b((void*)t, 0); - src3 = (v8u16)__msa_ld_b((void*)t, 16); - vec0 = src0 & const_0x1F; - vec1 = src1 & const_0x1F; - vec0 += src2 & const_0x1F; - vec1 += src3 & const_0x1F; - vec0 = (v8u16)__msa_pckev_b((v16i8)vec1, (v16i8)vec0); - src0 = (v8u16)__msa_srai_h((v8i16)src0, 5); - src1 = (v8u16)__msa_srai_h((v8i16)src1, 5); - src2 = (v8u16)__msa_srai_h((v8i16)src2, 5); - src3 = (v8u16)__msa_srai_h((v8i16)src3, 5); - vec2 = src0 & const_0x1F; - vec3 = src1 & const_0x1F; - vec2 += src2 & const_0x1F; - vec3 += src3 & const_0x1F; - vec2 = (v8u16)__msa_pckev_b((v16i8)vec3, (v16i8)vec2); - src0 = (v8u16)__msa_srai_h((v8i16)src0, 5); - src1 = (v8u16)__msa_srai_h((v8i16)src1, 5); - src2 = (v8u16)__msa_srai_h((v8i16)src2, 5); - src3 = (v8u16)__msa_srai_h((v8i16)src3, 5); - vec4 = src0 & const_0x1F; - vec5 = src1 & const_0x1F; - vec4 += src2 & const_0x1F; - vec5 += src3 & const_0x1F; - vec4 = (v8u16)__msa_pckev_b((v16i8)vec5, (v16i8)vec4); - vec0 = __msa_hadd_u_h((v16u8)vec0, (v16u8)vec0); - vec2 = __msa_hadd_u_h((v16u8)vec2, (v16u8)vec2); - vec4 = __msa_hadd_u_h((v16u8)vec4, (v16u8)vec4); - vec6 = (v8u16)__msa_slli_h((v8i16)vec0, 1); - vec6 |= (v8u16)__msa_srai_h((v8i16)vec0, 6); - vec0 = (v8u16)__msa_slli_h((v8i16)vec2, 1); - vec0 |= (v8u16)__msa_srai_h((v8i16)vec2, 6); - vec2 = (v8u16)__msa_slli_h((v8i16)vec4, 1); - vec2 |= (v8u16)__msa_srai_h((v8i16)vec4, 6); - reg0 = vec6 * const_0x70; - reg1 = vec0 * const_0x4A; - reg2 = vec2 * const_0x70; - reg3 = vec0 * const_0x5E; - reg0 += const_0x8080; - reg1 += vec2 * const_0x26; - reg2 += const_0x8080; - reg3 += vec6 * const_0x12; - reg0 -= reg1; - reg2 -= reg3; - reg0 = (v8u16)__msa_srai_h((v8i16)reg0, 8); - reg2 = (v8u16)__msa_srai_h((v8i16)reg2, 8); - dst0 = (v16u8)__msa_pckev_b((v16i8)reg2, (v16i8)reg0); - res0 = __msa_copy_u_d((v2i64)dst0, 0); - res1 = __msa_copy_u_d((v2i64)dst0, 1); - SD(res0, dst_u); - SD(res1, dst_v); - s += 16; - t += 16; - dst_u += 8; - dst_v += 8; - } -} - -void RGB565ToUVRow_MSA(const uint8_t* src_rgb565, - int src_stride_rgb565, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - int x; - const uint16_t* s = (const uint16_t*)src_rgb565; - const uint16_t* t = (const uint16_t*)(src_rgb565 + src_stride_rgb565); - int64_t res0, res1; - v8u16 src0, src1, src2, src3, reg0, reg1, reg2, reg3; - v8u16 vec0, vec1, vec2, vec3, vec4, vec5; - v16u8 dst0; - v8u16 const_0x70 = (v8u16)__msa_ldi_h(0x70); - v8u16 const_0x4A = (v8u16)__msa_ldi_h(0x4A); - v8u16 const_0x26 = (v8u16)__msa_ldi_h(0x26); - v8u16 const_0x5E = (v8u16)__msa_ldi_h(0x5E); - v8u16 const_0x12 = (v8u16)__msa_ldi_h(0x12); - v8u16 const_32896 = (v8u16)__msa_fill_h(0x8080); - v8u16 const_0x1F = (v8u16)__msa_ldi_h(0x1F); - v8u16 const_0x3F = (v8u16)__msa_fill_h(0x3F); - - for (x = 0; x < width; x += 16) { - src0 = (v8u16)__msa_ld_b((void*)s, 0); - src1 = (v8u16)__msa_ld_b((void*)s, 16); - src2 = (v8u16)__msa_ld_b((void*)t, 
0); - src3 = (v8u16)__msa_ld_b((void*)t, 16); - vec0 = src0 & const_0x1F; - vec1 = src1 & const_0x1F; - vec0 += src2 & const_0x1F; - vec1 += src3 & const_0x1F; - vec0 = (v8u16)__msa_pckev_b((v16i8)vec1, (v16i8)vec0); - src0 = (v8u16)__msa_srai_h((v8i16)src0, 5); - src1 = (v8u16)__msa_srai_h((v8i16)src1, 5); - src2 = (v8u16)__msa_srai_h((v8i16)src2, 5); - src3 = (v8u16)__msa_srai_h((v8i16)src3, 5); - vec2 = src0 & const_0x3F; - vec3 = src1 & const_0x3F; - vec2 += src2 & const_0x3F; - vec3 += src3 & const_0x3F; - vec1 = (v8u16)__msa_pckev_b((v16i8)vec3, (v16i8)vec2); - src0 = (v8u16)__msa_srai_h((v8i16)src0, 6); - src1 = (v8u16)__msa_srai_h((v8i16)src1, 6); - src2 = (v8u16)__msa_srai_h((v8i16)src2, 6); - src3 = (v8u16)__msa_srai_h((v8i16)src3, 6); - vec4 = src0 & const_0x1F; - vec5 = src1 & const_0x1F; - vec4 += src2 & const_0x1F; - vec5 += src3 & const_0x1F; - vec2 = (v8u16)__msa_pckev_b((v16i8)vec5, (v16i8)vec4); - vec0 = __msa_hadd_u_h((v16u8)vec0, (v16u8)vec0); - vec1 = __msa_hadd_u_h((v16u8)vec1, (v16u8)vec1); - vec2 = __msa_hadd_u_h((v16u8)vec2, (v16u8)vec2); - vec3 = (v8u16)__msa_slli_h((v8i16)vec0, 1); - vec3 |= (v8u16)__msa_srai_h((v8i16)vec0, 6); - vec4 = (v8u16)__msa_slli_h((v8i16)vec2, 1); - vec4 |= (v8u16)__msa_srai_h((v8i16)vec2, 6); - reg0 = vec3 * const_0x70; - reg1 = vec1 * const_0x4A; - reg2 = vec4 * const_0x70; - reg3 = vec1 * const_0x5E; - reg0 += const_32896; - reg1 += vec4 * const_0x26; - reg2 += const_32896; - reg3 += vec3 * const_0x12; - reg0 -= reg1; - reg2 -= reg3; - reg0 = (v8u16)__msa_srai_h((v8i16)reg0, 8); - reg2 = (v8u16)__msa_srai_h((v8i16)reg2, 8); - dst0 = (v16u8)__msa_pckev_b((v16i8)reg2, (v16i8)reg0); - res0 = __msa_copy_u_d((v2i64)dst0, 0); - res1 = __msa_copy_u_d((v2i64)dst0, 1); - SD(res0, dst_u); - SD(res1, dst_v); - s += 16; - t += 16; - dst_u += 8; - dst_v += 8; - } -} - -void RGB24ToUVRow_MSA(const uint8_t* src_rgb, - int src_stride_rgb, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - int x; - const uint8_t* s = src_rgb; - const uint8_t* t = src_rgb + src_stride_rgb; - int64_t res0, res1; - v16u8 src0, src1, src2, src3, src4, src5, src6, src7; - v16u8 inp0, inp1, inp2, inp3, inp4, inp5; - v8u16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; - v8i16 reg0, reg1, reg2, reg3; - v16u8 dst0; - v8u16 const_0x70 = (v8u16)__msa_fill_h(0x38); - v8u16 const_0x4A = (v8u16)__msa_fill_h(0x25); - v8u16 const_0x26 = (v8u16)__msa_fill_h(0x13); - v8u16 const_0x5E = (v8u16)__msa_fill_h(0x2f); - v8u16 const_0x12 = (v8u16)__msa_fill_h(0x09); - v8u16 const_0x8080 = (v8u16)__msa_fill_h(0x8080); - v8u16 const_0x0001 = (v8u16)__msa_fill_h(0x0001); - v16i8 mask = {0, 1, 2, 16, 3, 4, 5, 17, 6, 7, 8, 18, 9, 10, 11, 19}; - v16i8 zero = {0}; - - for (x = 0; x < width; x += 16) { - inp0 = (v16u8)__msa_ld_b((void*)s, 0); - inp1 = (v16u8)__msa_ld_b((void*)s, 16); - inp2 = (v16u8)__msa_ld_b((void*)s, 32); - inp3 = (v16u8)__msa_ld_b((void*)t, 0); - inp4 = (v16u8)__msa_ld_b((void*)t, 16); - inp5 = (v16u8)__msa_ld_b((void*)t, 32); - src1 = (v16u8)__msa_sldi_b((v16i8)inp1, (v16i8)inp0, 12); - src5 = (v16u8)__msa_sldi_b((v16i8)inp4, (v16i8)inp3, 12); - src2 = (v16u8)__msa_sldi_b((v16i8)inp2, (v16i8)inp1, 8); - src6 = (v16u8)__msa_sldi_b((v16i8)inp5, (v16i8)inp4, 8); - src3 = (v16u8)__msa_sldi_b((v16i8)inp2, (v16i8)inp2, 4); - src7 = (v16u8)__msa_sldi_b((v16i8)inp5, (v16i8)inp5, 4); - src0 = (v16u8)__msa_vshf_b(mask, (v16i8)zero, (v16i8)inp0); - src1 = (v16u8)__msa_vshf_b(mask, (v16i8)zero, (v16i8)src1); - src2 = (v16u8)__msa_vshf_b(mask, (v16i8)zero, (v16i8)src2); - src3 = 
(v16u8)__msa_vshf_b(mask, (v16i8)zero, (v16i8)src3); - src4 = (v16u8)__msa_vshf_b(mask, (v16i8)zero, (v16i8)inp3); - src5 = (v16u8)__msa_vshf_b(mask, (v16i8)zero, (v16i8)src5); - src6 = (v16u8)__msa_vshf_b(mask, (v16i8)zero, (v16i8)src6); - src7 = (v16u8)__msa_vshf_b(mask, (v16i8)zero, (v16i8)src7); - vec0 = (v8u16)__msa_ilvr_b((v16i8)src4, (v16i8)src0); - vec1 = (v8u16)__msa_ilvl_b((v16i8)src4, (v16i8)src0); - vec2 = (v8u16)__msa_ilvr_b((v16i8)src5, (v16i8)src1); - vec3 = (v8u16)__msa_ilvl_b((v16i8)src5, (v16i8)src1); - vec4 = (v8u16)__msa_ilvr_b((v16i8)src6, (v16i8)src2); - vec5 = (v8u16)__msa_ilvl_b((v16i8)src6, (v16i8)src2); - vec6 = (v8u16)__msa_ilvr_b((v16i8)src7, (v16i8)src3); - vec7 = (v8u16)__msa_ilvl_b((v16i8)src7, (v16i8)src3); - vec0 = (v8u16)__msa_hadd_u_h((v16u8)vec0, (v16u8)vec0); - vec1 = (v8u16)__msa_hadd_u_h((v16u8)vec1, (v16u8)vec1); - vec2 = (v8u16)__msa_hadd_u_h((v16u8)vec2, (v16u8)vec2); - vec3 = (v8u16)__msa_hadd_u_h((v16u8)vec3, (v16u8)vec3); - vec4 = (v8u16)__msa_hadd_u_h((v16u8)vec4, (v16u8)vec4); - vec5 = (v8u16)__msa_hadd_u_h((v16u8)vec5, (v16u8)vec5); - vec6 = (v8u16)__msa_hadd_u_h((v16u8)vec6, (v16u8)vec6); - vec7 = (v8u16)__msa_hadd_u_h((v16u8)vec7, (v16u8)vec7); - reg0 = (v8i16)__msa_pckev_d((v2i64)vec1, (v2i64)vec0); - reg1 = (v8i16)__msa_pckev_d((v2i64)vec3, (v2i64)vec2); - reg2 = (v8i16)__msa_pckev_d((v2i64)vec5, (v2i64)vec4); - reg3 = (v8i16)__msa_pckev_d((v2i64)vec7, (v2i64)vec6); - reg0 += (v8i16)__msa_pckod_d((v2i64)vec1, (v2i64)vec0); - reg1 += (v8i16)__msa_pckod_d((v2i64)vec3, (v2i64)vec2); - reg2 += (v8i16)__msa_pckod_d((v2i64)vec5, (v2i64)vec4); - reg3 += (v8i16)__msa_pckod_d((v2i64)vec7, (v2i64)vec6); - reg0 += const_0x0001; - reg1 += const_0x0001; - reg2 += const_0x0001; - reg3 += const_0x0001; - reg0 = __msa_srai_h((v8i16)reg0, 1); - reg1 = __msa_srai_h((v8i16)reg1, 1); - reg2 = __msa_srai_h((v8i16)reg2, 1); - reg3 = __msa_srai_h((v8i16)reg3, 1); - vec4 = (v8u16)__msa_pckev_h(reg1, reg0); - vec5 = (v8u16)__msa_pckev_h(reg3, reg2); - vec6 = (v8u16)__msa_pckod_h(reg1, reg0); - vec7 = (v8u16)__msa_pckod_h(reg3, reg2); - vec0 = (v8u16)__msa_pckev_h((v8i16)vec5, (v8i16)vec4); - vec1 = (v8u16)__msa_pckev_h((v8i16)vec7, (v8i16)vec6); - vec2 = (v8u16)__msa_pckod_h((v8i16)vec5, (v8i16)vec4); - vec3 = vec0 * const_0x70; - vec4 = vec1 * const_0x4A; - vec5 = vec2 * const_0x26; - vec2 *= const_0x70; - vec1 *= const_0x5E; - vec0 *= const_0x12; - reg0 = __msa_subv_h((v8i16)vec3, (v8i16)vec4); - reg1 = __msa_subv_h((v8i16)const_0x8080, (v8i16)vec5); - reg2 = __msa_subv_h((v8i16)vec2, (v8i16)vec1); - reg3 = __msa_subv_h((v8i16)const_0x8080, (v8i16)vec0); - reg0 += reg1; - reg2 += reg3; - reg0 = __msa_srai_h(reg0, 8); - reg2 = __msa_srai_h(reg2, 8); - dst0 = (v16u8)__msa_pckev_b((v16i8)reg2, (v16i8)reg0); - res0 = __msa_copy_u_d((v2i64)dst0, 0); - res1 = __msa_copy_u_d((v2i64)dst0, 1); - SD(res0, dst_u); - SD(res1, dst_v); - t += 48; - s += 48; - dst_u += 8; - dst_v += 8; - } -} - -void RAWToUVRow_MSA(const uint8_t* src_rgb, - int src_stride_rgb, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - int x; - const uint8_t* s = src_rgb; - const uint8_t* t = src_rgb + src_stride_rgb; - int64_t res0, res1; - v16u8 inp0, inp1, inp2, inp3, inp4, inp5; - v16u8 src0, src1, src2, src3, src4, src5, src6, src7; - v8u16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; - v8i16 reg0, reg1, reg2, reg3; - v16u8 dst0; - v8u16 const_0x70 = (v8u16)__msa_fill_h(0x38); - v8u16 const_0x4A = (v8u16)__msa_fill_h(0x25); - v8u16 const_0x26 = (v8u16)__msa_fill_h(0x13); - v8u16 const_0x5E = 
(v8u16)__msa_fill_h(0x2f); - v8u16 const_0x12 = (v8u16)__msa_fill_h(0x09); - v8u16 const_0x8080 = (v8u16)__msa_fill_h(0x8080); - v8u16 const_0x0001 = (v8u16)__msa_fill_h(0x0001); - v16i8 mask = {0, 1, 2, 16, 3, 4, 5, 17, 6, 7, 8, 18, 9, 10, 11, 19}; - v16i8 zero = {0}; - - for (x = 0; x < width; x += 16) { - inp0 = (v16u8)__msa_ld_b((void*)s, 0); - inp1 = (v16u8)__msa_ld_b((void*)s, 16); - inp2 = (v16u8)__msa_ld_b((void*)s, 32); - inp3 = (v16u8)__msa_ld_b((void*)t, 0); - inp4 = (v16u8)__msa_ld_b((void*)t, 16); - inp5 = (v16u8)__msa_ld_b((void*)t, 32); - src1 = (v16u8)__msa_sldi_b((v16i8)inp1, (v16i8)inp0, 12); - src5 = (v16u8)__msa_sldi_b((v16i8)inp4, (v16i8)inp3, 12); - src2 = (v16u8)__msa_sldi_b((v16i8)inp2, (v16i8)inp1, 8); - src6 = (v16u8)__msa_sldi_b((v16i8)inp5, (v16i8)inp4, 8); - src3 = (v16u8)__msa_sldi_b((v16i8)inp2, (v16i8)inp2, 4); - src7 = (v16u8)__msa_sldi_b((v16i8)inp5, (v16i8)inp5, 4); - src0 = (v16u8)__msa_vshf_b(mask, (v16i8)zero, (v16i8)inp0); - src1 = (v16u8)__msa_vshf_b(mask, (v16i8)zero, (v16i8)src1); - src2 = (v16u8)__msa_vshf_b(mask, (v16i8)zero, (v16i8)src2); - src3 = (v16u8)__msa_vshf_b(mask, (v16i8)zero, (v16i8)src3); - src4 = (v16u8)__msa_vshf_b(mask, (v16i8)zero, (v16i8)inp3); - src5 = (v16u8)__msa_vshf_b(mask, (v16i8)zero, (v16i8)src5); - src6 = (v16u8)__msa_vshf_b(mask, (v16i8)zero, (v16i8)src6); - src7 = (v16u8)__msa_vshf_b(mask, (v16i8)zero, (v16i8)src7); - vec0 = (v8u16)__msa_ilvr_b((v16i8)src4, (v16i8)src0); - vec1 = (v8u16)__msa_ilvl_b((v16i8)src4, (v16i8)src0); - vec2 = (v8u16)__msa_ilvr_b((v16i8)src5, (v16i8)src1); - vec3 = (v8u16)__msa_ilvl_b((v16i8)src5, (v16i8)src1); - vec4 = (v8u16)__msa_ilvr_b((v16i8)src6, (v16i8)src2); - vec5 = (v8u16)__msa_ilvl_b((v16i8)src6, (v16i8)src2); - vec6 = (v8u16)__msa_ilvr_b((v16i8)src7, (v16i8)src3); - vec7 = (v8u16)__msa_ilvl_b((v16i8)src7, (v16i8)src3); - vec0 = (v8u16)__msa_hadd_u_h((v16u8)vec0, (v16u8)vec0); - vec1 = (v8u16)__msa_hadd_u_h((v16u8)vec1, (v16u8)vec1); - vec2 = (v8u16)__msa_hadd_u_h((v16u8)vec2, (v16u8)vec2); - vec3 = (v8u16)__msa_hadd_u_h((v16u8)vec3, (v16u8)vec3); - vec4 = (v8u16)__msa_hadd_u_h((v16u8)vec4, (v16u8)vec4); - vec5 = (v8u16)__msa_hadd_u_h((v16u8)vec5, (v16u8)vec5); - vec6 = (v8u16)__msa_hadd_u_h((v16u8)vec6, (v16u8)vec6); - vec7 = (v8u16)__msa_hadd_u_h((v16u8)vec7, (v16u8)vec7); - reg0 = (v8i16)__msa_pckev_d((v2i64)vec1, (v2i64)vec0); - reg1 = (v8i16)__msa_pckev_d((v2i64)vec3, (v2i64)vec2); - reg2 = (v8i16)__msa_pckev_d((v2i64)vec5, (v2i64)vec4); - reg3 = (v8i16)__msa_pckev_d((v2i64)vec7, (v2i64)vec6); - reg0 += (v8i16)__msa_pckod_d((v2i64)vec1, (v2i64)vec0); - reg1 += (v8i16)__msa_pckod_d((v2i64)vec3, (v2i64)vec2); - reg2 += (v8i16)__msa_pckod_d((v2i64)vec5, (v2i64)vec4); - reg3 += (v8i16)__msa_pckod_d((v2i64)vec7, (v2i64)vec6); - reg0 += const_0x0001; - reg1 += const_0x0001; - reg2 += const_0x0001; - reg3 += const_0x0001; - reg0 = __msa_srai_h(reg0, 1); - reg1 = __msa_srai_h(reg1, 1); - reg2 = __msa_srai_h(reg2, 1); - reg3 = __msa_srai_h(reg3, 1); - vec4 = (v8u16)__msa_pckev_h((v8i16)reg1, (v8i16)reg0); - vec5 = (v8u16)__msa_pckev_h((v8i16)reg3, (v8i16)reg2); - vec6 = (v8u16)__msa_pckod_h((v8i16)reg1, (v8i16)reg0); - vec7 = (v8u16)__msa_pckod_h((v8i16)reg3, (v8i16)reg2); - vec0 = (v8u16)__msa_pckod_h((v8i16)vec5, (v8i16)vec4); - vec1 = (v8u16)__msa_pckev_h((v8i16)vec7, (v8i16)vec6); - vec2 = (v8u16)__msa_pckev_h((v8i16)vec5, (v8i16)vec4); - vec3 = vec0 * const_0x70; - vec4 = vec1 * const_0x4A; - vec5 = vec2 * const_0x26; - vec2 *= const_0x70; - vec1 *= const_0x5E; - vec0 *= const_0x12; 
- reg0 = __msa_subv_h((v8i16)vec3, (v8i16)vec4); - reg1 = __msa_subv_h((v8i16)const_0x8080, (v8i16)vec5); - reg2 = __msa_subv_h((v8i16)vec2, (v8i16)vec1); - reg3 = __msa_subv_h((v8i16)const_0x8080, (v8i16)vec0); - reg0 += reg1; - reg2 += reg3; - reg0 = __msa_srai_h(reg0, 8); - reg2 = __msa_srai_h(reg2, 8); - dst0 = (v16u8)__msa_pckev_b((v16i8)reg2, (v16i8)reg0); - res0 = __msa_copy_u_d((v2i64)dst0, 0); - res1 = __msa_copy_u_d((v2i64)dst0, 1); - SD(res0, dst_u); - SD(res1, dst_v); - t += 48; - s += 48; - dst_u += 8; - dst_v += 8; - } -} - -void NV12ToARGBRow_MSA(const uint8_t* src_y, - const uint8_t* src_uv, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { - int x; - uint64_t val0, val1; - v16u8 src0, src1, res0, res1, dst0, dst1; - v8i16 vec0, vec1, vec2; - v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg; - v4i32 vec_ubvr, vec_ugvg; - v16u8 zero = {0}; - v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL); - - YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, - vec_br, vec_yg); - vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub); - vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug); - - for (x = 0; x < width; x += 8) { - val0 = LD(src_y); - val1 = LD(src_uv); - src0 = (v16u8)__msa_insert_d((v2i64)zero, 0, val0); - src1 = (v16u8)__msa_insert_d((v2i64)zero, 0, val1); - YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg, - vec0, vec1, vec2); - res0 = (v16u8)__msa_ilvev_b((v16i8)vec2, (v16i8)vec0); - res1 = (v16u8)__msa_ilvev_b((v16i8)alpha, (v16i8)vec1); - dst0 = (v16u8)__msa_ilvr_b((v16i8)res1, (v16i8)res0); - dst1 = (v16u8)__msa_ilvl_b((v16i8)res1, (v16i8)res0); - ST_UB2(dst0, dst1, dst_argb, 16); - src_y += 8; - src_uv += 8; - dst_argb += 32; - } -} - -void NV12ToRGB565Row_MSA(const uint8_t* src_y, - const uint8_t* src_uv, - uint8_t* dst_rgb565, - const struct YuvConstants* yuvconstants, - int width) { - int x; - uint64_t val0, val1; - v16u8 src0, src1, dst0; - v8i16 vec0, vec1, vec2; - v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg; - v4i32 vec_ubvr, vec_ugvg; - v16u8 zero = {0}; - - YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, - vec_br, vec_yg); - vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub); - vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug); - - for (x = 0; x < width; x += 8) { - val0 = LD(src_y); - val1 = LD(src_uv); - src0 = (v16u8)__msa_insert_d((v2i64)zero, 0, val0); - src1 = (v16u8)__msa_insert_d((v2i64)zero, 0, val1); - YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg, - vec0, vec1, vec2); - vec0 = vec0 >> 3; - vec1 = (vec1 >> 2) << 5; - vec2 = (vec2 >> 3) << 11; - dst0 = (v16u8)(vec0 | vec1 | vec2); - ST_UB(dst0, dst_rgb565); - src_y += 8; - src_uv += 8; - dst_rgb565 += 16; - } -} - -void NV21ToARGBRow_MSA(const uint8_t* src_y, - const uint8_t* src_vu, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { - int x; - uint64_t val0, val1; - v16u8 src0, src1, res0, res1, dst0, dst1; - v8i16 vec0, vec1, vec2; - v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg; - v4i32 vec_ubvr, vec_ugvg; - v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL); - v16u8 zero = {0}; - v16i8 shuffler = {1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14}; - - YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, - vec_br, vec_yg); - vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub); - vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug); - - for (x = 0; x < width; x += 8) 
{ - val0 = LD(src_y); - val1 = LD(src_vu); - src0 = (v16u8)__msa_insert_d((v2i64)zero, 0, val0); - src1 = (v16u8)__msa_insert_d((v2i64)zero, 0, val1); - src1 = (v16u8)__msa_vshf_b(shuffler, (v16i8)src1, (v16i8)src1); - YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg, - vec0, vec1, vec2); - res0 = (v16u8)__msa_ilvev_b((v16i8)vec2, (v16i8)vec0); - res1 = (v16u8)__msa_ilvev_b((v16i8)alpha, (v16i8)vec1); - dst0 = (v16u8)__msa_ilvr_b((v16i8)res1, (v16i8)res0); - dst1 = (v16u8)__msa_ilvl_b((v16i8)res1, (v16i8)res0); - ST_UB2(dst0, dst1, dst_argb, 16); - src_y += 8; - src_vu += 8; - dst_argb += 32; - } -} - -void SobelRow_MSA(const uint8_t* src_sobelx, - const uint8_t* src_sobely, - uint8_t* dst_argb, - int width) { - int x; - v16u8 src0, src1, vec0, dst0, dst1, dst2, dst3; - v16i8 mask0 = {0, 0, 0, 16, 1, 1, 1, 16, 2, 2, 2, 16, 3, 3, 3, 16}; - v16i8 const_0x4 = __msa_ldi_b(0x4); - v16i8 mask1 = mask0 + const_0x4; - v16i8 mask2 = mask1 + const_0x4; - v16i8 mask3 = mask2 + const_0x4; - v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL); - - for (x = 0; x < width; x += 16) { - src0 = (v16u8)__msa_ld_b((void*)src_sobelx, 0); - src1 = (v16u8)__msa_ld_b((void*)src_sobely, 0); - vec0 = __msa_adds_u_b(src0, src1); - dst0 = (v16u8)__msa_vshf_b(mask0, (v16i8)alpha, (v16i8)vec0); - dst1 = (v16u8)__msa_vshf_b(mask1, (v16i8)alpha, (v16i8)vec0); - dst2 = (v16u8)__msa_vshf_b(mask2, (v16i8)alpha, (v16i8)vec0); - dst3 = (v16u8)__msa_vshf_b(mask3, (v16i8)alpha, (v16i8)vec0); - ST_UB4(dst0, dst1, dst2, dst3, dst_argb, 16); - src_sobelx += 16; - src_sobely += 16; - dst_argb += 64; - } -} - -void SobelToPlaneRow_MSA(const uint8_t* src_sobelx, - const uint8_t* src_sobely, - uint8_t* dst_y, - int width) { - int x; - v16u8 src0, src1, src2, src3, dst0, dst1; - - for (x = 0; x < width; x += 32) { - src0 = (v16u8)__msa_ld_b((void*)src_sobelx, 0); - src1 = (v16u8)__msa_ld_b((void*)src_sobelx, 16); - src2 = (v16u8)__msa_ld_b((void*)src_sobely, 0); - src3 = (v16u8)__msa_ld_b((void*)src_sobely, 16); - dst0 = __msa_adds_u_b(src0, src2); - dst1 = __msa_adds_u_b(src1, src3); - ST_UB2(dst0, dst1, dst_y, 16); - src_sobelx += 32; - src_sobely += 32; - dst_y += 32; - } -} - -void SobelXYRow_MSA(const uint8_t* src_sobelx, - const uint8_t* src_sobely, - uint8_t* dst_argb, - int width) { - int x; - v16u8 src0, src1, vec0, vec1, vec2; - v16u8 reg0, reg1, dst0, dst1, dst2, dst3; - v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL); - - for (x = 0; x < width; x += 16) { - src0 = (v16u8)__msa_ld_b((void*)src_sobelx, 0); - src1 = (v16u8)__msa_ld_b((void*)src_sobely, 0); - vec0 = __msa_adds_u_b(src0, src1); - vec1 = (v16u8)__msa_ilvr_b((v16i8)src0, (v16i8)src1); - vec2 = (v16u8)__msa_ilvl_b((v16i8)src0, (v16i8)src1); - reg0 = (v16u8)__msa_ilvr_b((v16i8)alpha, (v16i8)vec0); - reg1 = (v16u8)__msa_ilvl_b((v16i8)alpha, (v16i8)vec0); - dst0 = (v16u8)__msa_ilvr_b((v16i8)reg0, (v16i8)vec1); - dst1 = (v16u8)__msa_ilvl_b((v16i8)reg0, (v16i8)vec1); - dst2 = (v16u8)__msa_ilvr_b((v16i8)reg1, (v16i8)vec2); - dst3 = (v16u8)__msa_ilvl_b((v16i8)reg1, (v16i8)vec2); - ST_UB4(dst0, dst1, dst2, dst3, dst_argb, 16); - src_sobelx += 16; - src_sobely += 16; - dst_argb += 64; - } -} - -void ARGBToYJRow_MSA(const uint8_t* src_argb, uint8_t* dst_y, int width) { - int x; - v16u8 src0, src1, src2, src3, dst0; - v16u8 const_0x961D = (v16u8)__msa_fill_h(0x961D); - v16u8 const_0x4D = (v16u8)__msa_fill_h(0x4D); - v8u16 const_0x80 = (v8u16)__msa_fill_h(0x80); - - for (x = 0; x < width; x += 16) { - src0 = (v16u8)__msa_ld_b((void*)src_argb, 0); - src1 = 
(v16u8)__msa_ld_b((void*)src_argb, 16); - src2 = (v16u8)__msa_ld_b((void*)src_argb, 32); - src3 = (v16u8)__msa_ld_b((void*)src_argb, 48); - ARGBTOY(src0, src1, src2, src3, const_0x961D, const_0x4D, const_0x80, 8, - dst0); - ST_UB(dst0, dst_y); - src_argb += 64; - dst_y += 16; - } -} - -void BGRAToYRow_MSA(const uint8_t* src_argb, uint8_t* dst_y, int width) { - int x; - v16u8 src0, src1, src2, src3, dst0; - v16u8 const_0x4200 = (v16u8)__msa_fill_h(0x4200); - v16u8 const_0x1981 = (v16u8)__msa_fill_h(0x1981); - v8u16 const_0x1080 = (v8u16)__msa_fill_h(0x1080); - - for (x = 0; x < width; x += 16) { - src0 = (v16u8)__msa_ld_b((void*)src_argb, 0); - src1 = (v16u8)__msa_ld_b((void*)src_argb, 16); - src2 = (v16u8)__msa_ld_b((void*)src_argb, 32); - src3 = (v16u8)__msa_ld_b((void*)src_argb, 48); - ARGBTOY(src0, src1, src2, src3, const_0x4200, const_0x1981, const_0x1080, 8, - dst0); - ST_UB(dst0, dst_y); - src_argb += 64; - dst_y += 16; - } -} - -void ABGRToYRow_MSA(const uint8_t* src_argb, uint8_t* dst_y, int width) { - int x; - v16u8 src0, src1, src2, src3, dst0; - v16u8 const_0x8142 = (v16u8)__msa_fill_h(0x8142); - v16u8 const_0x19 = (v16u8)__msa_fill_h(0x19); - v8u16 const_0x1080 = (v8u16)__msa_fill_h(0x1080); - - for (x = 0; x < width; x += 16) { - src0 = (v16u8)__msa_ld_b((void*)src_argb, 0); - src1 = (v16u8)__msa_ld_b((void*)src_argb, 16); - src2 = (v16u8)__msa_ld_b((void*)src_argb, 32); - src3 = (v16u8)__msa_ld_b((void*)src_argb, 48); - ARGBTOY(src0, src1, src2, src3, const_0x8142, const_0x19, const_0x1080, 8, - dst0); - ST_UB(dst0, dst_y); - src_argb += 64; - dst_y += 16; - } -} - -void RGBAToYRow_MSA(const uint8_t* src_argb, uint8_t* dst_y, int width) { - int x; - v16u8 src0, src1, src2, src3, dst0; - v16u8 const_0x1900 = (v16u8)__msa_fill_h(0x1900); - v16u8 const_0x4281 = (v16u8)__msa_fill_h(0x4281); - v8u16 const_0x1080 = (v8u16)__msa_fill_h(0x1080); - - for (x = 0; x < width; x += 16) { - src0 = (v16u8)__msa_ld_b((void*)src_argb, 0); - src1 = (v16u8)__msa_ld_b((void*)src_argb, 16); - src2 = (v16u8)__msa_ld_b((void*)src_argb, 32); - src3 = (v16u8)__msa_ld_b((void*)src_argb, 48); - ARGBTOY(src0, src1, src2, src3, const_0x1900, const_0x4281, const_0x1080, 8, - dst0); - ST_UB(dst0, dst_y); - src_argb += 64; - dst_y += 16; - } -} - -void ARGBToUVJRow_MSA(const uint8_t* src_rgb, - int src_stride_rgb, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - int x; - const uint8_t* s = src_rgb; - const uint8_t* t = src_rgb + src_stride_rgb; - v8u16 src0, src1, src2, src3, src4, src5, src6, src7; - v8u16 vec0, vec1, vec2, vec3; - v8u16 dst0, dst1, dst2, dst3; - v16u8 zero = {0}; - v8i16 shuffler0 = {0, 3, 4, 7, 8, 11, 12, 15}; - v8i16 shuffler1 = {1, 2, 5, 6, 9, 10, 13, 14}; - v8i16 shuffler2 = {2, 3, 6, 7, 10, 11, 14, 15}; - v8i16 shuffler3 = {0, 1, 4, 5, 8, 9, 12, 13}; - v8u16 const_0x0000003f = (v8u16)__msa_fill_w(0x0000003f); - v4u32 const_0x00008080 = (v8u16)__msa_fill_w(0x00008080); - v8u16 const_0x0015002a = (v8u16)__msa_fill_w(0x0015002a); - v8u16 const_0x0035000a = (v8u16)__msa_fill_w(0x0035000a); - v4i32 shift = __msa_fill_w(0x00000008); - - for (x = 0; x < width; x += 32) { - src1 = __msa_ld_b((void*)s, 0); - src3 = __msa_ld_b((void*)s, 16); - src5 = __msa_ld_b((void*)t, 0); - src7 = __msa_ld_b((void*)t, 16); - src0 = __msa_ilvr_b(zero, src1); - src1 = __msa_ilvl_b(zero, src1); - src2 = __msa_ilvr_b(zero, src3); - src3 = __msa_ilvl_b(zero, src3); - src4 = __msa_ilvr_b(zero, src5); - src5 = __msa_ilvl_b(zero, src5); - src6 = __msa_ilvr_b(zero, src7); - src7 = __msa_ilvl_b(zero, src7); - 
src0 += src4; - src1 += src5; - src2 += src6; - src3 += src7; - src4 = __msa_ilvev_d(src1, src0); - src5 = __msa_ilvod_d(src1, src0); - src6 = __msa_ilvev_d(src3, src2); - src7 = __msa_ilvod_d(src3, src2); - vec0 = __msa_aver_u_h(src4, src5); - vec1 = __msa_aver_u_h(src6, src7); - - src1 = __msa_ld_b((void*)s, 32); - src3 = __msa_ld_b((void*)s, 48); - src5 = __msa_ld_b((void*)t, 32); - src7 = __msa_ld_b((void*)t, 48); - src0 = __msa_ilvr_b(zero, src1); - src1 = __msa_ilvl_b(zero, src1); - src2 = __msa_ilvr_b(zero, src3); - src3 = __msa_ilvl_b(zero, src3); - src4 = __msa_ilvr_b(zero, src5); - src5 = __msa_ilvl_b(zero, src5); - src6 = __msa_ilvr_b(zero, src7); - src7 = __msa_ilvl_b(zero, src7); - src0 += src4; - src1 += src5; - src2 += src6; - src3 += src7; - src4 = __msa_ilvev_d(src1, src0); - src5 = __msa_ilvod_d(src1, src0); - src6 = __msa_ilvev_d(src3, src2); - src7 = __msa_ilvod_d(src3, src2); - vec2 = __msa_aver_u_h(src4, src5); - vec3 = __msa_aver_u_h(src6, src7); - ARGBTOUV(vec0, vec1, vec2, vec3, const_0x0000003f, const_0x00008080, - const_0x0015002a, const_0x0035000a, shuffler0, shuffler1, - shuffler2, shuffler3, shift, dst0, dst1); - - src1 = __msa_ld_b((void*)s, 64); - src3 = __msa_ld_b((void*)s, 80); - src5 = __msa_ld_b((void*)t, 64); - src7 = __msa_ld_b((void*)t, 80); - src0 = __msa_ilvr_b(zero, src1); - src1 = __msa_ilvl_b(zero, src1); - src2 = __msa_ilvr_b(zero, src3); - src3 = __msa_ilvl_b(zero, src3); - src4 = __msa_ilvr_b(zero, src5); - src5 = __msa_ilvl_b(zero, src5); - src6 = __msa_ilvr_b(zero, src7); - src7 = __msa_ilvl_b(zero, src7); - src0 += src4; - src1 += src5; - src2 += src6; - src3 += src7; - src4 = __msa_ilvev_d(src1, src0); - src5 = __msa_ilvod_d(src1, src0); - src6 = __msa_ilvev_d(src3, src2); - src7 = __msa_ilvod_d(src3, src2); - vec0 = __msa_aver_u_h(src4, src5); - vec1 = __msa_aver_u_h(src6, src7); - - src1 = __msa_ld_b((void*)s, 96); - src3 = __msa_ld_b((void*)s, 112); - src5 = __msa_ld_b((void*)t, 96); - src7 = __msa_ld_b((void*)t, 112); - src0 = __msa_ilvr_b(zero, src1); - src1 = __msa_ilvl_b(zero, src1); - src2 = __msa_ilvr_b(zero, src3); - src3 = __msa_ilvl_b(zero, src3); - src4 = __msa_ilvr_b(zero, src5); - src5 = __msa_ilvl_b(zero, src5); - src6 = __msa_ilvr_b(zero, src7); - src7 = __msa_ilvl_b(zero, src7); - src0 += src4; - src1 += src5; - src2 += src6; - src3 += src7; - src4 = __msa_ilvev_d(src1, src0); - src5 = __msa_ilvod_d(src1, src0); - src6 = __msa_ilvev_d(src3, src2); - src7 = __msa_ilvod_d(src3, src2); - vec2 = __msa_aver_u_h(src4, src5); - vec3 = __msa_aver_u_h(src6, src7); - ARGBTOUV(vec0, vec1, vec2, vec3, const_0x0000003f, const_0x00008080, - const_0x0015002a, const_0x0035000a, shuffler0, shuffler1, - shuffler2, shuffler3, shift, dst2, dst3); - - dst0 = (v8u16)__msa_pckev_b(dst2, dst0); - dst1 = (v8u16)__msa_pckev_b(dst3, dst1); - ST_UB(dst0, dst_u); - ST_UB(dst1, dst_v); - s += 128; - t += 128; - dst_v += 16; - dst_u += 16; - } -} - -void BGRAToUVRow_MSA(const uint8_t* src_rgb, - int src_stride_rgb, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - int x; - const uint8_t* s = src_rgb; - const uint8_t* t = src_rgb + src_stride_rgb; - const uint8_t unused = 0xf; - v8u16 src0, src1, src2, src3; - v16u8 dst0, dst1; - v8i16 shuffler0 = {1, unused, 5, unused, 9, unused, 13, unused}; - v8i16 shuffler1 = {2, 3, 6, 7, 10, 11, 14, 15}; - v8i16 shuffler2 = {3, unused, 7, unused, 11, unused, 15, unused}; - v8i16 shuffler3 = {1, 2, 5, 6, 9, 10, 13, 14}; - v8u16 const_0x09002f = (v8u16)__msa_fill_w(0x09002f); - v8u16 const_0x000038 = 
(v8u16)__msa_fill_w(0x0038); - v8u16 const_0x250013 = (v8u16)__msa_fill_w(0x250013); - v4u32 const_0x008080 = (v4u32)__msa_fill_w(0x8080); - v8u16 const_0x0001 = (v8u16)__msa_fill_h(0x0001); - - for (x = 0; x < width; x += 16) { - READ_ARGB(s, t, src0, src1, src2, src3, const_0x0001); - ARGBTOUV_H(src0, src1, src2, src3, const_0x09002f, const_0x000038, - const_0x250013, const_0x008080, shuffler0, shuffler1, shuffler2, - shuffler3, dst0, dst1); - *((uint64_t*)dst_v) = __msa_copy_u_d((v2i64)dst0, 0); - *((uint64_t*)dst_u) = __msa_copy_u_d((v2i64)dst1, 0); - s += 64; - t += 64; - dst_u += 8; - dst_v += 8; - } -} - -void ABGRToUVRow_MSA(const uint8_t* src_rgb, - int src_stride_rgb, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - int x; - const uint8_t* s = src_rgb; - const uint8_t* t = src_rgb + src_stride_rgb; - const uint8_t unused = 0xf; - v8u16 src0, src1, src2, src3; - v16u8 dst0, dst1; - v8i16 shuffler0 = {0, unused, 4, unused, 8, unused, 12, unused}; - v8i16 shuffler1 = {1, 2, 5, 6, 9, 10, 13, 14}; - v8i16 shuffler2 = {2, unused, 6, unused, 10, unused, 14, unused}; - v8i16 shuffler3 = {0, 1, 4, 5, 8, 9, 12, 13}; - v8u16 const_0x09002f = (v8u16)__msa_fill_w(0x09002f); - v8u16 const_0x000038 = (v8u16)__msa_fill_w(0x0038); - v8u16 const_0x250013 = (v8u16)__msa_fill_w(0x250013); - v4u32 const_0x008080 = (v4u32)__msa_fill_w(0x8080); - v8u16 const_0x0001 = (v8u16)__msa_fill_h(0x0001); - - for (x = 0; x < width; x += 16) { - READ_ARGB(s, t, src0, src1, src2, src3, const_0x0001); - ARGBTOUV_H(src0, src1, src2, src3, const_0x09002f, const_0x000038, - const_0x250013, const_0x008080, shuffler0, shuffler1, shuffler2, - shuffler3, dst0, dst1); - *((uint64_t*)dst_v) = __msa_copy_u_d((v2i64)dst0, 0); - *((uint64_t*)dst_u) = __msa_copy_u_d((v2i64)dst1, 0); - s += 64; - t += 64; - dst_u += 8; - dst_v += 8; - } -} - -void RGBAToUVRow_MSA(const uint8_t* src_rgb, - int src_stride_rgb, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - int x; - const uint8_t* s = src_rgb; - const uint8_t* t = src_rgb + src_stride_rgb; - const uint8_t unused = 0xf; - v8u16 src0, src1, src2, src3; - v16u8 dst0, dst1; - v8i16 shuffler0 = {3, unused, 7, unused, 11, unused, 15, unused}; - v8i16 shuffler1 = {2, 1, 6, 5, 10, 9, 14, 13}; - v8i16 shuffler2 = {1, unused, 5, unused, 9, unused, 13, unused}; - v8i16 shuffler3 = {3, 2, 7, 6, 11, 10, 15, 14}; - v8u16 const_0x09002f = (v8u16)__msa_fill_w(0x09002f); - v8u16 const_0x000038 = (v8u16)__msa_fill_w(0x0038); - v8u16 const_0x250013 = (v8u16)__msa_fill_w(0x250013); - v4u32 const_0x008080 = (v4u32)__msa_fill_w(0x8080); - v8u16 const_0x0001 = (v8u16)__msa_fill_h(0x0001); - - for (x = 0; x < width; x += 16) { - READ_ARGB(s, t, src0, src1, src2, src3, const_0x0001); - ARGBTOUV_H(src0, src1, src2, src3, const_0x09002f, const_0x000038, - const_0x250013, const_0x008080, shuffler0, shuffler1, shuffler2, - shuffler3, dst0, dst1); - *((uint64_t*)dst_v) = __msa_copy_u_d((v2i64)dst0, 0); - *((uint64_t*)dst_u) = __msa_copy_u_d((v2i64)dst1, 0); - s += 64; - t += 64; - dst_u += 8; - dst_v += 8; - } -} - -void I444ToARGBRow_MSA(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { - int x; - v16u8 src0, src1, src2, dst0, dst1; - v8u16 vec0, vec1, vec2; - v4i32 reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7, reg8, reg9; - v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg; - v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL); - v8i16 zero = {0}; - - YUVTORGB_SETUP(yuvconstants, vec_ub, 
vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, - vec_br, vec_yg); - - for (x = 0; x < width; x += 8) { - READI444(src_y, src_u, src_v, src0, src1, src2); - vec0 = (v8u16)__msa_ilvr_b((v16i8)src0, (v16i8)src0); - reg0 = (v4i32)__msa_ilvr_h((v8i16)zero, (v8i16)vec0); - reg1 = (v4i32)__msa_ilvl_h((v8i16)zero, (v8i16)vec0); - reg0 *= vec_yg; - reg1 *= vec_yg; - reg0 = __msa_srai_w(reg0, 16); - reg1 = __msa_srai_w(reg1, 16); - reg4 = reg0 + vec_br; - reg5 = reg1 + vec_br; - reg2 = reg0 + vec_bg; - reg3 = reg1 + vec_bg; - reg0 += vec_bb; - reg1 += vec_bb; - vec0 = (v8u16)__msa_ilvr_b((v16i8)zero, (v16i8)src1); - vec1 = (v8u16)__msa_ilvr_b((v16i8)zero, (v16i8)src2); - reg6 = (v4i32)__msa_ilvr_h((v8i16)zero, (v8i16)vec0); - reg7 = (v4i32)__msa_ilvl_h((v8i16)zero, (v8i16)vec0); - reg8 = (v4i32)__msa_ilvr_h((v8i16)zero, (v8i16)vec1); - reg9 = (v4i32)__msa_ilvl_h((v8i16)zero, (v8i16)vec1); - reg0 -= reg6 * vec_ub; - reg1 -= reg7 * vec_ub; - reg2 -= reg6 * vec_ug; - reg3 -= reg7 * vec_ug; - reg4 -= reg8 * vec_vr; - reg5 -= reg9 * vec_vr; - reg2 -= reg8 * vec_vg; - reg3 -= reg9 * vec_vg; - reg0 = __msa_srai_w(reg0, 6); - reg1 = __msa_srai_w(reg1, 6); - reg2 = __msa_srai_w(reg2, 6); - reg3 = __msa_srai_w(reg3, 6); - reg4 = __msa_srai_w(reg4, 6); - reg5 = __msa_srai_w(reg5, 6); - CLIP_0TO255(reg0, reg1, reg2, reg3, reg4, reg5); - vec0 = (v8u16)__msa_pckev_h((v8i16)reg1, (v8i16)reg0); - vec1 = (v8u16)__msa_pckev_h((v8i16)reg3, (v8i16)reg2); - vec2 = (v8u16)__msa_pckev_h((v8i16)reg5, (v8i16)reg4); - vec0 = (v8u16)__msa_ilvev_b((v16i8)vec1, (v16i8)vec0); - vec1 = (v8u16)__msa_ilvev_b((v16i8)alpha, (v16i8)vec2); - dst0 = (v16u8)__msa_ilvr_h((v8i16)vec1, (v8i16)vec0); - dst1 = (v16u8)__msa_ilvl_h((v8i16)vec1, (v8i16)vec0); - ST_UB2(dst0, dst1, dst_argb, 16); - src_y += 8; - src_u += 8; - src_v += 8; - dst_argb += 32; - } -} - -// TODO - respect YuvConstants -void I400ToARGBRow_MSA(const uint8_t* src_y, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { - int x; -#if defined(__aarch64__) || defined(__arm__) - int ygb = yuvconstants->kUVBiasBGR[3]; - int yg = yuvconstants->kYToRgb[1]; -#else - int ygb = yuvconstants->kYBiasToRgb[0]; - int yg = yuvconstants->kYToRgb[0]; -#endif - v16u8 src0, res0, res1, res2, res3, res4, dst0, dst1, dst2, dst3; - v8i16 vec0, vec1; - v4i32 reg0, reg1, reg2, reg3; - v4i32 vec_yg = __msa_fill_w(yg); - v8i16 vec_ygb = __msa_fill_h(ygb); - v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL); - v8i16 max = __msa_ldi_h(0xFF); - v8i16 zero = {0}; - - for (x = 0; x < width; x += 16) { - src0 = (v16u8)__msa_ld_b((void*)src_y, 0); - vec0 = (v8i16)__msa_ilvr_b((v16i8)src0, (v16i8)src0); - vec1 = (v8i16)__msa_ilvl_b((v16i8)src0, (v16i8)src0); - reg0 = (v4i32)__msa_ilvr_h(zero, vec0); - reg1 = (v4i32)__msa_ilvl_h(zero, vec0); - reg2 = (v4i32)__msa_ilvr_h(zero, vec1); - reg3 = (v4i32)__msa_ilvl_h(zero, vec1); - reg0 *= vec_yg; - reg1 *= vec_yg; - reg2 *= vec_yg; - reg3 *= vec_yg; - reg0 = __msa_srai_w(reg0, 16); - reg1 = __msa_srai_w(reg1, 16); - reg2 = __msa_srai_w(reg2, 16); - reg3 = __msa_srai_w(reg3, 16); - vec0 = (v8i16)__msa_pckev_h((v8i16)reg1, (v8i16)reg0); - vec1 = (v8i16)__msa_pckev_h((v8i16)reg3, (v8i16)reg2); - vec0 += vec_ygb; - vec1 += vec_ygb; - vec0 = __msa_srai_h(vec0, 6); - vec1 = __msa_srai_h(vec1, 6); - vec0 = __msa_maxi_s_h(vec0, 0); - vec1 = __msa_maxi_s_h(vec1, 0); - vec0 = __msa_min_s_h(max, vec0); - vec1 = __msa_min_s_h(max, vec1); - res0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0); - res1 = (v16u8)__msa_ilvr_b((v16i8)res0, (v16i8)res0); - res2 = 
(v16u8)__msa_ilvl_b((v16i8)res0, (v16i8)res0); - res3 = (v16u8)__msa_ilvr_b((v16i8)alpha, (v16i8)res0); - res4 = (v16u8)__msa_ilvl_b((v16i8)alpha, (v16i8)res0); - dst0 = (v16u8)__msa_ilvr_b((v16i8)res3, (v16i8)res1); - dst1 = (v16u8)__msa_ilvl_b((v16i8)res3, (v16i8)res1); - dst2 = (v16u8)__msa_ilvr_b((v16i8)res4, (v16i8)res2); - dst3 = (v16u8)__msa_ilvl_b((v16i8)res4, (v16i8)res2); - ST_UB4(dst0, dst1, dst2, dst3, dst_argb, 16); - src_y += 16; - dst_argb += 64; - } -} - -void J400ToARGBRow_MSA(const uint8_t* src_y, uint8_t* dst_argb, int width) { - int x; - v16u8 src0, vec0, vec1, vec2, vec3, dst0, dst1, dst2, dst3; - v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL); - - for (x = 0; x < width; x += 16) { - src0 = (v16u8)__msa_ld_b((void*)src_y, 0); - vec0 = (v16u8)__msa_ilvr_b((v16i8)src0, (v16i8)src0); - vec1 = (v16u8)__msa_ilvl_b((v16i8)src0, (v16i8)src0); - vec2 = (v16u8)__msa_ilvr_b((v16i8)alpha, (v16i8)src0); - vec3 = (v16u8)__msa_ilvl_b((v16i8)alpha, (v16i8)src0); - dst0 = (v16u8)__msa_ilvr_b((v16i8)vec2, (v16i8)vec0); - dst1 = (v16u8)__msa_ilvl_b((v16i8)vec2, (v16i8)vec0); - dst2 = (v16u8)__msa_ilvr_b((v16i8)vec3, (v16i8)vec1); - dst3 = (v16u8)__msa_ilvl_b((v16i8)vec3, (v16i8)vec1); - ST_UB4(dst0, dst1, dst2, dst3, dst_argb, 16); - src_y += 16; - dst_argb += 64; - } -} - -void YUY2ToARGBRow_MSA(const uint8_t* src_yuy2, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { - int x; - v16u8 src0, src1, src2; - v8i16 vec0, vec1, vec2; - v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg; - v4i32 vec_ubvr, vec_ugvg; - v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL); - - YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, - vec_br, vec_yg); - vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub); - vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug); - - for (x = 0; x < width; x += 8) { - src0 = (v16u8)__msa_ld_b((void*)src_yuy2, 0); - src1 = (v16u8)__msa_pckev_b((v16i8)src0, (v16i8)src0); - src2 = (v16u8)__msa_pckod_b((v16i8)src0, (v16i8)src0); - YUVTORGB(src1, src2, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg, - vec0, vec1, vec2); - STOREARGB(vec0, vec1, vec2, alpha, dst_argb); - src_yuy2 += 16; - dst_argb += 32; - } -} - -void UYVYToARGBRow_MSA(const uint8_t* src_uyvy, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { - int x; - v16u8 src0, src1, src2; - v8i16 vec0, vec1, vec2; - v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg; - v4i32 vec_ubvr, vec_ugvg; - v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL); - - YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, - vec_br, vec_yg); - vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub); - vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug); - - for (x = 0; x < width; x += 8) { - src0 = (v16u8)__msa_ld_b((void*)src_uyvy, 0); - src1 = (v16u8)__msa_pckod_b((v16i8)src0, (v16i8)src0); - src2 = (v16u8)__msa_pckev_b((v16i8)src0, (v16i8)src0); - YUVTORGB(src1, src2, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg, - vec0, vec1, vec2); - STOREARGB(vec0, vec1, vec2, alpha, dst_argb); - src_uyvy += 16; - dst_argb += 32; - } -} - -void InterpolateRow_MSA(uint8_t* dst_ptr, - const uint8_t* src_ptr, - ptrdiff_t src_stride, - int width, - int32_t source_y_fraction) { - int32_t y1_fraction = source_y_fraction; - int32_t y0_fraction = 256 - y1_fraction; - uint16_t y_fractions; - const uint8_t* s = src_ptr; - const uint8_t* t = src_ptr + src_stride; - int x; - v16u8 src0, src1, src2, src3, dst0, dst1; - v8u16 vec0, vec1, 
vec2, vec3, y_frac; - - if (0 == y1_fraction) { - memcpy(dst_ptr, src_ptr, width); - return; - } - - if (128 == y1_fraction) { - for (x = 0; x < width; x += 32) { - src0 = (v16u8)__msa_ld_b((void*)s, 0); - src1 = (v16u8)__msa_ld_b((void*)s, 16); - src2 = (v16u8)__msa_ld_b((void*)t, 0); - src3 = (v16u8)__msa_ld_b((void*)t, 16); - dst0 = __msa_aver_u_b(src0, src2); - dst1 = __msa_aver_u_b(src1, src3); - ST_UB2(dst0, dst1, dst_ptr, 16); - s += 32; - t += 32; - dst_ptr += 32; - } - return; - } - - y_fractions = (uint16_t)(y0_fraction + (y1_fraction << 8)); - y_frac = (v8u16)__msa_fill_h(y_fractions); - - for (x = 0; x < width; x += 32) { - src0 = (v16u8)__msa_ld_b((void*)s, 0); - src1 = (v16u8)__msa_ld_b((void*)s, 16); - src2 = (v16u8)__msa_ld_b((void*)t, 0); - src3 = (v16u8)__msa_ld_b((void*)t, 16); - vec0 = (v8u16)__msa_ilvr_b((v16i8)src2, (v16i8)src0); - vec1 = (v8u16)__msa_ilvl_b((v16i8)src2, (v16i8)src0); - vec2 = (v8u16)__msa_ilvr_b((v16i8)src3, (v16i8)src1); - vec3 = (v8u16)__msa_ilvl_b((v16i8)src3, (v16i8)src1); - vec0 = (v8u16)__msa_dotp_u_h((v16u8)vec0, (v16u8)y_frac); - vec1 = (v8u16)__msa_dotp_u_h((v16u8)vec1, (v16u8)y_frac); - vec2 = (v8u16)__msa_dotp_u_h((v16u8)vec2, (v16u8)y_frac); - vec3 = (v8u16)__msa_dotp_u_h((v16u8)vec3, (v16u8)y_frac); - vec0 = (v8u16)__msa_srari_h((v8i16)vec0, 8); - vec1 = (v8u16)__msa_srari_h((v8i16)vec1, 8); - vec2 = (v8u16)__msa_srari_h((v8i16)vec2, 8); - vec3 = (v8u16)__msa_srari_h((v8i16)vec3, 8); - dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0); - dst1 = (v16u8)__msa_pckev_b((v16i8)vec3, (v16i8)vec2); - ST_UB2(dst0, dst1, dst_ptr, 16); - s += 32; - t += 32; - dst_ptr += 32; - } -} - -void ARGBSetRow_MSA(uint8_t* dst_argb, uint32_t v32, int width) { - int x; - v4i32 dst0 = __builtin_msa_fill_w(v32); - - for (x = 0; x < width; x += 4) { - ST_UB(dst0, dst_argb); - dst_argb += 16; - } -} - -void RAWToRGB24Row_MSA(const uint8_t* src_raw, uint8_t* dst_rgb24, int width) { - int x; - v16u8 src0, src1, src2, src3, src4, dst0, dst1, dst2; - v16i8 shuffler0 = {2, 1, 0, 5, 4, 3, 8, 7, 6, 11, 10, 9, 14, 13, 12, 17}; - v16i8 shuffler1 = {8, 7, 12, 11, 10, 15, 14, 13, - 18, 17, 16, 21, 20, 19, 24, 23}; - v16i8 shuffler2 = {14, 19, 18, 17, 22, 21, 20, 25, - 24, 23, 28, 27, 26, 31, 30, 29}; - - for (x = 0; x < width; x += 16) { - src0 = (v16u8)__msa_ld_b((void*)src_raw, 0); - src1 = (v16u8)__msa_ld_b((void*)src_raw, 16); - src2 = (v16u8)__msa_ld_b((void*)src_raw, 32); - src3 = (v16u8)__msa_sldi_b((v16i8)src1, (v16i8)src0, 8); - src4 = (v16u8)__msa_sldi_b((v16i8)src2, (v16i8)src1, 8); - dst0 = (v16u8)__msa_vshf_b(shuffler0, (v16i8)src1, (v16i8)src0); - dst1 = (v16u8)__msa_vshf_b(shuffler1, (v16i8)src4, (v16i8)src3); - dst2 = (v16u8)__msa_vshf_b(shuffler2, (v16i8)src2, (v16i8)src1); - ST_UB2(dst0, dst1, dst_rgb24, 16); - ST_UB(dst2, (dst_rgb24 + 32)); - src_raw += 48; - dst_rgb24 += 48; - } -} - -void MergeUVRow_MSA(const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* dst_uv, - int width) { - int x; - v16u8 src0, src1, dst0, dst1; - - for (x = 0; x < width; x += 16) { - src0 = (v16u8)__msa_ld_b((void*)src_u, 0); - src1 = (v16u8)__msa_ld_b((void*)src_v, 0); - dst0 = (v16u8)__msa_ilvr_b((v16i8)src1, (v16i8)src0); - dst1 = (v16u8)__msa_ilvl_b((v16i8)src1, (v16i8)src0); - ST_UB2(dst0, dst1, dst_uv, 16); - src_u += 16; - src_v += 16; - dst_uv += 32; - } -} - -void ARGBExtractAlphaRow_MSA(const uint8_t* src_argb, - uint8_t* dst_a, - int width) { - int i; - v16u8 src0, src1, src2, src3, vec0, vec1, dst0; - - for (i = 0; i < width; i += 16) { - src0 = 
(v16u8)__msa_ld_b((void*)src_argb, 0); - src1 = (v16u8)__msa_ld_b((void*)src_argb, 16); - src2 = (v16u8)__msa_ld_b((void*)src_argb, 32); - src3 = (v16u8)__msa_ld_b((void*)src_argb, 48); - vec0 = (v16u8)__msa_pckod_b((v16i8)src1, (v16i8)src0); - vec1 = (v16u8)__msa_pckod_b((v16i8)src3, (v16i8)src2); - dst0 = (v16u8)__msa_pckod_b((v16i8)vec1, (v16i8)vec0); - ST_UB(dst0, dst_a); - src_argb += 64; - dst_a += 16; - } -} - -void ARGBBlendRow_MSA(const uint8_t* src_argb, - const uint8_t* src_argb1, - uint8_t* dst_argb, - int width) { - int x; - v16u8 src0, src1, src2, src3, dst0, dst1, dst2, dst3; - v8u16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; - v8u16 vec8, vec9, vec10, vec11, vec12, vec13; - v8u16 const_256 = (v8u16)__msa_ldi_h(256); - v16u8 const_255 = (v16u8)__msa_ldi_b(255); - v16u8 mask = {0, 0, 0, 255, 0, 0, 0, 255, 0, 0, 0, 255, 0, 0, 0, 255}; - v16i8 zero = {0}; - - for (x = 0; x < width; x += 8) { - src0 = (v16u8)__msa_ld_b((void*)src_argb, 0); - src1 = (v16u8)__msa_ld_b((void*)src_argb, 16); - src2 = (v16u8)__msa_ld_b((void*)src_argb1, 0); - src3 = (v16u8)__msa_ld_b((void*)src_argb1, 16); - vec0 = (v8u16)__msa_ilvr_b(zero, (v16i8)src0); - vec1 = (v8u16)__msa_ilvl_b(zero, (v16i8)src0); - vec2 = (v8u16)__msa_ilvr_b(zero, (v16i8)src1); - vec3 = (v8u16)__msa_ilvl_b(zero, (v16i8)src1); - vec4 = (v8u16)__msa_ilvr_b(zero, (v16i8)src2); - vec5 = (v8u16)__msa_ilvl_b(zero, (v16i8)src2); - vec6 = (v8u16)__msa_ilvr_b(zero, (v16i8)src3); - vec7 = (v8u16)__msa_ilvl_b(zero, (v16i8)src3); - vec8 = (v8u16)__msa_fill_h(vec0[3]); - vec9 = (v8u16)__msa_fill_h(vec0[7]); - vec10 = (v8u16)__msa_fill_h(vec1[3]); - vec11 = (v8u16)__msa_fill_h(vec1[7]); - vec8 = (v8u16)__msa_pckev_d((v2i64)vec9, (v2i64)vec8); - vec9 = (v8u16)__msa_pckev_d((v2i64)vec11, (v2i64)vec10); - vec10 = (v8u16)__msa_fill_h(vec2[3]); - vec11 = (v8u16)__msa_fill_h(vec2[7]); - vec12 = (v8u16)__msa_fill_h(vec3[3]); - vec13 = (v8u16)__msa_fill_h(vec3[7]); - vec10 = (v8u16)__msa_pckev_d((v2i64)vec11, (v2i64)vec10); - vec11 = (v8u16)__msa_pckev_d((v2i64)vec13, (v2i64)vec12); - vec8 = const_256 - vec8; - vec9 = const_256 - vec9; - vec10 = const_256 - vec10; - vec11 = const_256 - vec11; - vec8 *= vec4; - vec9 *= vec5; - vec10 *= vec6; - vec11 *= vec7; - vec8 = (v8u16)__msa_srai_h((v8i16)vec8, 8); - vec9 = (v8u16)__msa_srai_h((v8i16)vec9, 8); - vec10 = (v8u16)__msa_srai_h((v8i16)vec10, 8); - vec11 = (v8u16)__msa_srai_h((v8i16)vec11, 8); - dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0); - dst1 = (v16u8)__msa_pckev_b((v16i8)vec3, (v16i8)vec2); - dst2 = (v16u8)__msa_pckev_b((v16i8)vec9, (v16i8)vec8); - dst3 = (v16u8)__msa_pckev_b((v16i8)vec11, (v16i8)vec10); - dst0 = (v16u8)__msa_adds_u_b(dst0, dst2); - dst1 = (v16u8)__msa_adds_u_b(dst1, dst3); - dst0 = __msa_bmnz_v(dst0, const_255, mask); - dst1 = __msa_bmnz_v(dst1, const_255, mask); - ST_UB2(dst0, dst1, dst_argb, 16); - src_argb += 32; - src_argb1 += 32; - dst_argb += 32; - } -} - -void ARGBQuantizeRow_MSA(uint8_t* dst_argb, - int scale, - int interval_size, - int interval_offset, - int width) { - int x; - v16u8 src0, src1, src2, src3, dst0, dst1, dst2, dst3; - v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; - v4i32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; - v4i32 tmp8, tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15; - v4i32 vec_scale = __msa_fill_w(scale); - v16u8 vec_int_sz = (v16u8)__msa_fill_b(interval_size); - v16u8 vec_int_ofst = (v16u8)__msa_fill_b(interval_offset); - v16i8 mask = {0, 1, 2, 19, 4, 5, 6, 23, 8, 9, 10, 27, 12, 13, 14, 31}; - v16i8 zero = {0}; - - 
for (x = 0; x < width; x += 16) { - src0 = (v16u8)__msa_ld_b((void*)dst_argb, 0); - src1 = (v16u8)__msa_ld_b((void*)dst_argb, 16); - src2 = (v16u8)__msa_ld_b((void*)dst_argb, 32); - src3 = (v16u8)__msa_ld_b((void*)dst_argb, 48); - vec0 = (v8i16)__msa_ilvr_b(zero, (v16i8)src0); - vec1 = (v8i16)__msa_ilvl_b(zero, (v16i8)src0); - vec2 = (v8i16)__msa_ilvr_b(zero, (v16i8)src1); - vec3 = (v8i16)__msa_ilvl_b(zero, (v16i8)src1); - vec4 = (v8i16)__msa_ilvr_b(zero, (v16i8)src2); - vec5 = (v8i16)__msa_ilvl_b(zero, (v16i8)src2); - vec6 = (v8i16)__msa_ilvr_b(zero, (v16i8)src3); - vec7 = (v8i16)__msa_ilvl_b(zero, (v16i8)src3); - tmp0 = (v4i32)__msa_ilvr_h((v8i16)zero, (v8i16)vec0); - tmp1 = (v4i32)__msa_ilvl_h((v8i16)zero, (v8i16)vec0); - tmp2 = (v4i32)__msa_ilvr_h((v8i16)zero, (v8i16)vec1); - tmp3 = (v4i32)__msa_ilvl_h((v8i16)zero, (v8i16)vec1); - tmp4 = (v4i32)__msa_ilvr_h((v8i16)zero, (v8i16)vec2); - tmp5 = (v4i32)__msa_ilvl_h((v8i16)zero, (v8i16)vec2); - tmp6 = (v4i32)__msa_ilvr_h((v8i16)zero, (v8i16)vec3); - tmp7 = (v4i32)__msa_ilvl_h((v8i16)zero, (v8i16)vec3); - tmp8 = (v4i32)__msa_ilvr_h((v8i16)zero, (v8i16)vec4); - tmp9 = (v4i32)__msa_ilvl_h((v8i16)zero, (v8i16)vec4); - tmp10 = (v4i32)__msa_ilvr_h((v8i16)zero, (v8i16)vec5); - tmp11 = (v4i32)__msa_ilvl_h((v8i16)zero, (v8i16)vec5); - tmp12 = (v4i32)__msa_ilvr_h((v8i16)zero, (v8i16)vec6); - tmp13 = (v4i32)__msa_ilvl_h((v8i16)zero, (v8i16)vec6); - tmp14 = (v4i32)__msa_ilvr_h((v8i16)zero, (v8i16)vec7); - tmp15 = (v4i32)__msa_ilvl_h((v8i16)zero, (v8i16)vec7); - tmp0 *= vec_scale; - tmp1 *= vec_scale; - tmp2 *= vec_scale; - tmp3 *= vec_scale; - tmp4 *= vec_scale; - tmp5 *= vec_scale; - tmp6 *= vec_scale; - tmp7 *= vec_scale; - tmp8 *= vec_scale; - tmp9 *= vec_scale; - tmp10 *= vec_scale; - tmp11 *= vec_scale; - tmp12 *= vec_scale; - tmp13 *= vec_scale; - tmp14 *= vec_scale; - tmp15 *= vec_scale; - tmp0 >>= 16; - tmp1 >>= 16; - tmp2 >>= 16; - tmp3 >>= 16; - tmp4 >>= 16; - tmp5 >>= 16; - tmp6 >>= 16; - tmp7 >>= 16; - tmp8 >>= 16; - tmp9 >>= 16; - tmp10 >>= 16; - tmp11 >>= 16; - tmp12 >>= 16; - tmp13 >>= 16; - tmp14 >>= 16; - tmp15 >>= 16; - vec0 = (v8i16)__msa_pckev_h((v8i16)tmp1, (v8i16)tmp0); - vec1 = (v8i16)__msa_pckev_h((v8i16)tmp3, (v8i16)tmp2); - vec2 = (v8i16)__msa_pckev_h((v8i16)tmp5, (v8i16)tmp4); - vec3 = (v8i16)__msa_pckev_h((v8i16)tmp7, (v8i16)tmp6); - vec4 = (v8i16)__msa_pckev_h((v8i16)tmp9, (v8i16)tmp8); - vec5 = (v8i16)__msa_pckev_h((v8i16)tmp11, (v8i16)tmp10); - vec6 = (v8i16)__msa_pckev_h((v8i16)tmp13, (v8i16)tmp12); - vec7 = (v8i16)__msa_pckev_h((v8i16)tmp15, (v8i16)tmp14); - dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0); - dst1 = (v16u8)__msa_pckev_b((v16i8)vec3, (v16i8)vec2); - dst2 = (v16u8)__msa_pckev_b((v16i8)vec5, (v16i8)vec4); - dst3 = (v16u8)__msa_pckev_b((v16i8)vec7, (v16i8)vec6); - dst0 *= vec_int_sz; - dst1 *= vec_int_sz; - dst2 *= vec_int_sz; - dst3 *= vec_int_sz; - dst0 += vec_int_ofst; - dst1 += vec_int_ofst; - dst2 += vec_int_ofst; - dst3 += vec_int_ofst; - dst0 = (v16u8)__msa_vshf_b(mask, (v16i8)src0, (v16i8)dst0); - dst1 = (v16u8)__msa_vshf_b(mask, (v16i8)src1, (v16i8)dst1); - dst2 = (v16u8)__msa_vshf_b(mask, (v16i8)src2, (v16i8)dst2); - dst3 = (v16u8)__msa_vshf_b(mask, (v16i8)src3, (v16i8)dst3); - ST_UB4(dst0, dst1, dst2, dst3, dst_argb, 16); - dst_argb += 64; - } -} - -void ARGBColorMatrixRow_MSA(const uint8_t* src_argb, - uint8_t* dst_argb, - const int8_t* matrix_argb, - int width) { - int32_t x; - v16i8 src0; - v16u8 src1, src2, dst0, dst1; - v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9; 
- v8i16 vec10, vec11, vec12, vec13, vec14, vec15, vec16, vec17; - v4i32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; - v4i32 tmp8, tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15; - v16i8 zero = {0}; - v8i16 max = __msa_ldi_h(255); - - src0 = __msa_ld_b((void*)matrix_argb, 0); - vec0 = (v8i16)__msa_ilvr_b(zero, src0); - vec1 = (v8i16)__msa_ilvl_b(zero, src0); - - for (x = 0; x < width; x += 8) { - src1 = (v16u8)__msa_ld_b((void*)src_argb, 0); - src2 = (v16u8)__msa_ld_b((void*)src_argb, 16); - vec2 = (v8i16)__msa_ilvr_b(zero, (v16i8)src1); - vec3 = (v8i16)__msa_ilvl_b(zero, (v16i8)src1); - vec4 = (v8i16)__msa_ilvr_b(zero, (v16i8)src2); - vec5 = (v8i16)__msa_ilvl_b(zero, (v16i8)src2); - vec6 = (v8i16)__msa_pckod_d((v2i64)vec2, (v2i64)vec2); - vec7 = (v8i16)__msa_pckod_d((v2i64)vec3, (v2i64)vec3); - vec8 = (v8i16)__msa_pckod_d((v2i64)vec4, (v2i64)vec4); - vec9 = (v8i16)__msa_pckod_d((v2i64)vec5, (v2i64)vec5); - vec2 = (v8i16)__msa_pckev_d((v2i64)vec2, (v2i64)vec2); - vec3 = (v8i16)__msa_pckev_d((v2i64)vec3, (v2i64)vec3); - vec4 = (v8i16)__msa_pckev_d((v2i64)vec4, (v2i64)vec4); - vec5 = (v8i16)__msa_pckev_d((v2i64)vec5, (v2i64)vec5); - vec10 = vec2 * vec0; - vec11 = vec2 * vec1; - vec12 = vec6 * vec0; - vec13 = vec6 * vec1; - tmp0 = __msa_hadd_s_w(vec10, vec10); - tmp1 = __msa_hadd_s_w(vec11, vec11); - tmp2 = __msa_hadd_s_w(vec12, vec12); - tmp3 = __msa_hadd_s_w(vec13, vec13); - vec14 = vec3 * vec0; - vec15 = vec3 * vec1; - vec16 = vec7 * vec0; - vec17 = vec7 * vec1; - tmp4 = __msa_hadd_s_w(vec14, vec14); - tmp5 = __msa_hadd_s_w(vec15, vec15); - tmp6 = __msa_hadd_s_w(vec16, vec16); - tmp7 = __msa_hadd_s_w(vec17, vec17); - vec10 = __msa_pckev_h((v8i16)tmp1, (v8i16)tmp0); - vec11 = __msa_pckev_h((v8i16)tmp3, (v8i16)tmp2); - vec12 = __msa_pckev_h((v8i16)tmp5, (v8i16)tmp4); - vec13 = __msa_pckev_h((v8i16)tmp7, (v8i16)tmp6); - tmp0 = __msa_hadd_s_w(vec10, vec10); - tmp1 = __msa_hadd_s_w(vec11, vec11); - tmp2 = __msa_hadd_s_w(vec12, vec12); - tmp3 = __msa_hadd_s_w(vec13, vec13); - tmp0 = __msa_srai_w(tmp0, 6); - tmp1 = __msa_srai_w(tmp1, 6); - tmp2 = __msa_srai_w(tmp2, 6); - tmp3 = __msa_srai_w(tmp3, 6); - vec2 = vec4 * vec0; - vec6 = vec4 * vec1; - vec3 = vec8 * vec0; - vec7 = vec8 * vec1; - tmp8 = __msa_hadd_s_w(vec2, vec2); - tmp9 = __msa_hadd_s_w(vec6, vec6); - tmp10 = __msa_hadd_s_w(vec3, vec3); - tmp11 = __msa_hadd_s_w(vec7, vec7); - vec4 = vec5 * vec0; - vec8 = vec5 * vec1; - vec5 = vec9 * vec0; - vec9 = vec9 * vec1; - tmp12 = __msa_hadd_s_w(vec4, vec4); - tmp13 = __msa_hadd_s_w(vec8, vec8); - tmp14 = __msa_hadd_s_w(vec5, vec5); - tmp15 = __msa_hadd_s_w(vec9, vec9); - vec14 = __msa_pckev_h((v8i16)tmp9, (v8i16)tmp8); - vec15 = __msa_pckev_h((v8i16)tmp11, (v8i16)tmp10); - vec16 = __msa_pckev_h((v8i16)tmp13, (v8i16)tmp12); - vec17 = __msa_pckev_h((v8i16)tmp15, (v8i16)tmp14); - tmp4 = __msa_hadd_s_w(vec14, vec14); - tmp5 = __msa_hadd_s_w(vec15, vec15); - tmp6 = __msa_hadd_s_w(vec16, vec16); - tmp7 = __msa_hadd_s_w(vec17, vec17); - tmp4 = __msa_srai_w(tmp4, 6); - tmp5 = __msa_srai_w(tmp5, 6); - tmp6 = __msa_srai_w(tmp6, 6); - tmp7 = __msa_srai_w(tmp7, 6); - vec10 = __msa_pckev_h((v8i16)tmp1, (v8i16)tmp0); - vec11 = __msa_pckev_h((v8i16)tmp3, (v8i16)tmp2); - vec12 = __msa_pckev_h((v8i16)tmp5, (v8i16)tmp4); - vec13 = __msa_pckev_h((v8i16)tmp7, (v8i16)tmp6); - vec10 = __msa_maxi_s_h(vec10, 0); - vec11 = __msa_maxi_s_h(vec11, 0); - vec12 = __msa_maxi_s_h(vec12, 0); - vec13 = __msa_maxi_s_h(vec13, 0); - vec10 = __msa_min_s_h(vec10, max); - vec11 = __msa_min_s_h(vec11, max); - vec12 = 
__msa_min_s_h(vec12, max); - vec13 = __msa_min_s_h(vec13, max); - dst0 = (v16u8)__msa_pckev_b((v16i8)vec11, (v16i8)vec10); - dst1 = (v16u8)__msa_pckev_b((v16i8)vec13, (v16i8)vec12); - ST_UB2(dst0, dst1, dst_argb, 16); - src_argb += 32; - dst_argb += 32; - } -} - -void SplitUVRow_MSA(const uint8_t* src_uv, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - int x; - v16u8 src0, src1, src2, src3, dst0, dst1, dst2, dst3; - - for (x = 0; x < width; x += 32) { - src0 = (v16u8)__msa_ld_b((void*)src_uv, 0); - src1 = (v16u8)__msa_ld_b((void*)src_uv, 16); - src2 = (v16u8)__msa_ld_b((void*)src_uv, 32); - src3 = (v16u8)__msa_ld_b((void*)src_uv, 48); - dst0 = (v16u8)__msa_pckev_b((v16i8)src1, (v16i8)src0); - dst1 = (v16u8)__msa_pckev_b((v16i8)src3, (v16i8)src2); - dst2 = (v16u8)__msa_pckod_b((v16i8)src1, (v16i8)src0); - dst3 = (v16u8)__msa_pckod_b((v16i8)src3, (v16i8)src2); - ST_UB2(dst0, dst1, dst_u, 16); - ST_UB2(dst2, dst3, dst_v, 16); - src_uv += 64; - dst_u += 32; - dst_v += 32; - } -} - -void SetRow_MSA(uint8_t* dst, uint8_t v8, int width) { - int x; - v16u8 dst0 = (v16u8)__msa_fill_b(v8); - - for (x = 0; x < width; x += 16) { - ST_UB(dst0, dst); - dst += 16; - } -} - -void MirrorSplitUVRow_MSA(const uint8_t* src_uv, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - int x; - v16u8 src0, src1, src2, src3; - v16u8 dst0, dst1, dst2, dst3; - v16i8 mask0 = {30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0}; - v16i8 mask1 = {31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1}; - - src_uv += (2 * width); - - for (x = 0; x < width; x += 32) { - src_uv -= 64; - src2 = (v16u8)__msa_ld_b((void*)src_uv, 0); - src3 = (v16u8)__msa_ld_b((void*)src_uv, 16); - src0 = (v16u8)__msa_ld_b((void*)src_uv, 32); - src1 = (v16u8)__msa_ld_b((void*)src_uv, 48); - dst0 = (v16u8)__msa_vshf_b(mask1, (v16i8)src1, (v16i8)src0); - dst1 = (v16u8)__msa_vshf_b(mask1, (v16i8)src3, (v16i8)src2); - dst2 = (v16u8)__msa_vshf_b(mask0, (v16i8)src1, (v16i8)src0); - dst3 = (v16u8)__msa_vshf_b(mask0, (v16i8)src3, (v16i8)src2); - ST_UB2(dst0, dst1, dst_v, 16); - ST_UB2(dst2, dst3, dst_u, 16); - dst_u += 32; - dst_v += 32; - } -} - -void SobelXRow_MSA(const uint8_t* src_y0, - const uint8_t* src_y1, - const uint8_t* src_y2, - uint8_t* dst_sobelx, - int32_t width) { - int x; - v16u8 src0, src1, src2, src3, src4, src5, dst0; - v8i16 vec0, vec1, vec2, vec3, vec4, vec5; - v16i8 mask0 = {0, 2, 1, 3, 2, 4, 3, 5, 4, 6, 5, 7, 6, 8, 7, 9}; - v16i8 tmp = __msa_ldi_b(8); - v16i8 mask1 = mask0 + tmp; - v8i16 zero = {0}; - v8i16 max = __msa_ldi_h(255); - - for (x = 0; x < width; x += 16) { - src0 = (v16u8)__msa_ld_b((void*)src_y0, 0); - src1 = (v16u8)__msa_ld_b((void*)src_y0, 16); - src2 = (v16u8)__msa_ld_b((void*)src_y1, 0); - src3 = (v16u8)__msa_ld_b((void*)src_y1, 16); - src4 = (v16u8)__msa_ld_b((void*)src_y2, 0); - src5 = (v16u8)__msa_ld_b((void*)src_y2, 16); - vec0 = (v8i16)__msa_vshf_b(mask0, (v16i8)src1, (v16i8)src0); - vec1 = (v8i16)__msa_vshf_b(mask1, (v16i8)src1, (v16i8)src0); - vec2 = (v8i16)__msa_vshf_b(mask0, (v16i8)src3, (v16i8)src2); - vec3 = (v8i16)__msa_vshf_b(mask1, (v16i8)src3, (v16i8)src2); - vec4 = (v8i16)__msa_vshf_b(mask0, (v16i8)src5, (v16i8)src4); - vec5 = (v8i16)__msa_vshf_b(mask1, (v16i8)src5, (v16i8)src4); - vec0 = (v8i16)__msa_hsub_u_h((v16u8)vec0, (v16u8)vec0); - vec1 = (v8i16)__msa_hsub_u_h((v16u8)vec1, (v16u8)vec1); - vec2 = (v8i16)__msa_hsub_u_h((v16u8)vec2, (v16u8)vec2); - vec3 = (v8i16)__msa_hsub_u_h((v16u8)vec3, (v16u8)vec3); - vec4 = (v8i16)__msa_hsub_u_h((v16u8)vec4, (v16u8)vec4); - vec5 = 
(v8i16)__msa_hsub_u_h((v16u8)vec5, (v16u8)vec5); - vec0 += vec2; - vec1 += vec3; - vec4 += vec2; - vec5 += vec3; - vec0 += vec4; - vec1 += vec5; - vec0 = __msa_add_a_h(zero, vec0); - vec1 = __msa_add_a_h(zero, vec1); - vec0 = __msa_maxi_s_h(vec0, 0); - vec1 = __msa_maxi_s_h(vec1, 0); - vec0 = __msa_min_s_h(max, vec0); - vec1 = __msa_min_s_h(max, vec1); - dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0); - ST_UB(dst0, dst_sobelx); - src_y0 += 16; - src_y1 += 16; - src_y2 += 16; - dst_sobelx += 16; - } -} - -void SobelYRow_MSA(const uint8_t* src_y0, - const uint8_t* src_y1, - uint8_t* dst_sobely, - int32_t width) { - int x; - v16u8 src0, src1, dst0; - v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6; - v8i16 zero = {0}; - v8i16 max = __msa_ldi_h(255); - - for (x = 0; x < width; x += 16) { - src0 = (v16u8)__msa_ld_b((void*)src_y0, 0); - src1 = (v16u8)__msa_ld_b((void*)src_y1, 0); - vec0 = (v8i16)__msa_ilvr_b((v16i8)zero, (v16i8)src0); - vec1 = (v8i16)__msa_ilvl_b((v16i8)zero, (v16i8)src0); - vec2 = (v8i16)__msa_ilvr_b((v16i8)zero, (v16i8)src1); - vec3 = (v8i16)__msa_ilvl_b((v16i8)zero, (v16i8)src1); - vec0 -= vec2; - vec1 -= vec3; - vec6[0] = src_y0[16] - src_y1[16]; - vec6[1] = src_y0[17] - src_y1[17]; - vec2 = (v8i16)__msa_sldi_b((v16i8)vec1, (v16i8)vec0, 2); - vec3 = (v8i16)__msa_sldi_b((v16i8)vec6, (v16i8)vec1, 2); - vec4 = (v8i16)__msa_sldi_b((v16i8)vec1, (v16i8)vec0, 4); - vec5 = (v8i16)__msa_sldi_b((v16i8)vec6, (v16i8)vec1, 4); - vec0 += vec2; - vec1 += vec3; - vec4 += vec2; - vec5 += vec3; - vec0 += vec4; - vec1 += vec5; - vec0 = __msa_add_a_h(zero, vec0); - vec1 = __msa_add_a_h(zero, vec1); - vec0 = __msa_maxi_s_h(vec0, 0); - vec1 = __msa_maxi_s_h(vec1, 0); - vec0 = __msa_min_s_h(max, vec0); - vec1 = __msa_min_s_h(max, vec1); - dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0); - ST_UB(dst0, dst_sobely); - src_y0 += 16; - src_y1 += 16; - dst_sobely += 16; - } -} - -void HalfFloatRow_MSA(const uint16_t* src, - uint16_t* dst, - float scale, - int width) { - int i; - v8u16 src0, src1, src2, src3, dst0, dst1, dst2, dst3; - v4u32 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; - v4f32 fvec0, fvec1, fvec2, fvec3, fvec4, fvec5, fvec6, fvec7; - v4f32 mult_vec; - v8i16 zero = {0}; - mult_vec[0] = 1.9259299444e-34f * scale; - mult_vec = (v4f32)__msa_splati_w((v4i32)mult_vec, 0); - - for (i = 0; i < width; i += 32) { - src0 = (v8u16)__msa_ld_h((void*)src, 0); - src1 = (v8u16)__msa_ld_h((void*)src, 16); - src2 = (v8u16)__msa_ld_h((void*)src, 32); - src3 = (v8u16)__msa_ld_h((void*)src, 48); - vec0 = (v4u32)__msa_ilvr_h(zero, (v8i16)src0); - vec1 = (v4u32)__msa_ilvl_h(zero, (v8i16)src0); - vec2 = (v4u32)__msa_ilvr_h(zero, (v8i16)src1); - vec3 = (v4u32)__msa_ilvl_h(zero, (v8i16)src1); - vec4 = (v4u32)__msa_ilvr_h(zero, (v8i16)src2); - vec5 = (v4u32)__msa_ilvl_h(zero, (v8i16)src2); - vec6 = (v4u32)__msa_ilvr_h(zero, (v8i16)src3); - vec7 = (v4u32)__msa_ilvl_h(zero, (v8i16)src3); - fvec0 = __msa_ffint_u_w(vec0); - fvec1 = __msa_ffint_u_w(vec1); - fvec2 = __msa_ffint_u_w(vec2); - fvec3 = __msa_ffint_u_w(vec3); - fvec4 = __msa_ffint_u_w(vec4); - fvec5 = __msa_ffint_u_w(vec5); - fvec6 = __msa_ffint_u_w(vec6); - fvec7 = __msa_ffint_u_w(vec7); - fvec0 *= mult_vec; - fvec1 *= mult_vec; - fvec2 *= mult_vec; - fvec3 *= mult_vec; - fvec4 *= mult_vec; - fvec5 *= mult_vec; - fvec6 *= mult_vec; - fvec7 *= mult_vec; - vec0 = ((v4u32)fvec0) >> 13; - vec1 = ((v4u32)fvec1) >> 13; - vec2 = ((v4u32)fvec2) >> 13; - vec3 = ((v4u32)fvec3) >> 13; - vec4 = ((v4u32)fvec4) >> 13; - vec5 = ((v4u32)fvec5) >> 13; - vec6 = 
((v4u32)fvec6) >> 13; - vec7 = ((v4u32)fvec7) >> 13; - dst0 = (v8u16)__msa_pckev_h((v8i16)vec1, (v8i16)vec0); - dst1 = (v8u16)__msa_pckev_h((v8i16)vec3, (v8i16)vec2); - dst2 = (v8u16)__msa_pckev_h((v8i16)vec5, (v8i16)vec4); - dst3 = (v8u16)__msa_pckev_h((v8i16)vec7, (v8i16)vec6); - ST_UH2(dst0, dst1, dst, 8); - ST_UH2(dst2, dst3, dst + 16, 8); - src += 32; - dst += 32; - } -} - -#ifdef __cplusplus -} // extern "C" -} // namespace libyuv -#endif - -#endif // !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa) diff --git a/thirdparty/libyuv/source/row_neon.cc b/thirdparty/libyuv/source/row_neon.cc deleted file mode 100644 index ccc4af6..0000000 --- a/thirdparty/libyuv/source/row_neon.cc +++ /dev/null @@ -1,3577 +0,0 @@ -/* - * Copyright 2011 The LibYuv Project Authors. All rights reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#include "libyuv/row.h" - -#include - -#ifdef __cplusplus -namespace libyuv { -extern "C" { -#endif - -// This module is for GCC Neon -#if !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__) && \ - !defined(__aarch64__) - -// q0: Y uint16x8_t -// d2: U uint8x8_t -// d3: V uint8x8_t - -// Read 8 Y, 4 U and 4 V from 422 -#define READYUV422 \ - "vld1.8 {d0}, [%[src_y]]! \n" \ - "vld1.32 {d2[0]}, [%[src_u]]! \n" \ - "vld1.32 {d2[1]}, [%[src_v]]! \n" \ - "vmov.u8 d1, d0 \n" \ - "vmovl.u8 q1, d2 \n" \ - "vzip.u8 d0, d1 \n" \ - "vsli.u16 q1, q1, #8 \n" - -// Read 8 Y, 8 U and 8 V from 444 -#define READYUV444 \ - "vld1.8 {d0}, [%[src_y]]! \n" \ - "vld1.8 {d2}, [%[src_u]]! \n" \ - "vmovl.u8 q0, d0 \n" \ - "vld1.8 {d3}, [%[src_v]]! \n" \ - "vsli.u16 q0, q0, #8 \n" - -// Read 8 Y, and set 4 U and 4 V to 128 -#define READYUV400 \ - "vld1.8 {d0}, [%[src_y]]! \n" \ - "vmov.u8 q1, #128 \n" \ - "vmovl.u8 q0, d0 \n" \ - "vsli.u16 q0, q0, #8 \n" - -// Read 8 Y and 4 UV from NV12 -#define READNV12 \ - "vld1.8 {d0}, [%[src_y]]! \n" \ - "vld1.8 {d2}, [%[src_uv]]! \n" \ - "vmov.u8 d1, d0 \n" \ - "vmov.u8 d3, d2 \n" \ - "vzip.u8 d0, d1 \n" \ - "vsli.u16 d2, d2, #8 \n" /* Duplicate low byte (U) */ \ - "vsri.u16 d3, d3, #8 \n" /* Duplicate high byte (V) */ - -// Read 8 Y and 4 VU from NV21 -#define READNV21 \ - "vld1.8 {d0}, [%[src_y]]! \n" \ - "vld1.8 {d2}, [%[src_vu]]! \n" \ - "vmov.u8 d1, d0 \n" \ - "vmov.u8 d3, d2 \n" \ - "vzip.u8 d0, d1 \n" \ - "vsri.u16 d2, d2, #8 \n" /* Duplicate high byte (U) */ \ - "vsli.u16 d3, d3, #8 \n" /* Duplicate low byte (V) */ - -// Read 8 YUY2 -#define READYUY2 \ - "vld2.8 {d0, d2}, [%[src_yuy2]]! \n" \ - "vmovl.u8 q0, d0 \n" \ - "vmov.u8 d3, d2 \n" \ - "vsli.u16 q0, q0, #8 \n" \ - "vsli.u16 d2, d2, #8 \n" \ - "vsri.u16 d3, d3, #8 \n" - -// Read 8 UYVY -#define READUYVY \ - "vld2.8 {d2, d3}, [%[src_uyvy]]! \n" \ - "vmovl.u8 q0, d3 \n" \ - "vmov.u8 d3, d2 \n" \ - "vsli.u16 q0, q0, #8 \n" \ - "vsli.u16 d2, d2, #8 \n" \ - "vsri.u16 d3, d3, #8 \n" - -#define YUVTORGB_SETUP \ - "vld4.8 {d26[], d27[], d28[], d29[]}, [%[kUVCoeff]] \n" \ - "vld1.16 {d31[]}, [%[kRGBCoeffBias]]! \n" \ - "vld1.16 {d20[], d21[]}, [%[kRGBCoeffBias]]! \n" \ - "vld1.16 {d22[], d23[]}, [%[kRGBCoeffBias]]! 
\n" \ - "vld1.16 {d24[], d25[]}, [%[kRGBCoeffBias]] \n" - -// q0: B uint16x8_t -// q1: G uint16x8_t -// q2: R uint16x8_t - -// Convert from YUV to 2.14 fixed point RGB -#define YUVTORGB \ - "vmull.u16 q2, d1, d31 \n" \ - "vmull.u8 q8, d3, d29 \n" /* DGV */ \ - "vmull.u16 q0, d0, d31 \n" \ - "vmlal.u8 q8, d2, d28 \n" /* DG */ \ - "vqshrn.u32 d0, q0, #16 \n" \ - "vqshrn.u32 d1, q2, #16 \n" /* Y */ \ - "vmull.u8 q9, d2, d26 \n" /* DB */ \ - "vmull.u8 q2, d3, d27 \n" /* DR */ \ - "vadd.u16 q4, q0, q11 \n" /* G */ \ - "vadd.u16 q2, q0, q2 \n" /* R */ \ - "vadd.u16 q0, q0, q9 \n" /* B */ \ - "vqsub.u16 q1, q4, q8 \n" /* G */ \ - "vqsub.u16 q0, q0, q10 \n" /* B */ \ - "vqsub.u16 q2, q2, q12 \n" /* R */ - -// Convert from 2.14 fixed point RGB To 8 bit RGB -#define RGBTORGB8 \ - "vqshrn.u16 d4, q2, #6 \n" /* R */ \ - "vqshrn.u16 d2, q1, #6 \n" /* G */ \ - "vqshrn.u16 d0, q0, #6 \n" /* B */ - -#define YUVTORGB_REGS \ - "q0", "q1", "q2", "q4", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "d31" - -#define STORERGBA \ - "vmov.u8 d1, d0 \n" \ - "vmov.u8 d3, d4 \n" \ - "vmov.u8 d0, d6 \n" \ - "vst4.8 {d0, d1, d2, d3}, [%[dst_rgba]]! \n" - -void I444ToARGBRow_NEON(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { - asm volatile( - YUVTORGB_SETUP - "vmov.u8 d6, #255 \n" - "1: \n" READYUV444 YUVTORGB - RGBTORGB8 - "subs %[width], %[width], #8 \n" - "vst4.8 {d0, d2, d4, d6}, [%[dst_argb]]! \n" - "bgt 1b \n" - : [src_y] "+r"(src_y), // %[src_y] - [src_u] "+r"(src_u), // %[src_u] - [src_v] "+r"(src_v), // %[src_v] - [dst_argb] "+r"(dst_argb), // %[dst_argb] - [width] "+r"(width) // %[width] - : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff] - [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias) // %[kRGBCoeffBias] - : "cc", "memory", YUVTORGB_REGS, "d6"); -} - -void I422ToARGBRow_NEON(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { - asm volatile( - YUVTORGB_SETUP - "vmov.u8 d6, #255 \n" - "1: \n" READYUV422 YUVTORGB - RGBTORGB8 - "subs %[width], %[width], #8 \n" - "vst4.8 {d0, d2, d4, d6}, [%[dst_argb]]! \n" - "bgt 1b \n" - : [src_y] "+r"(src_y), // %[src_y] - [src_u] "+r"(src_u), // %[src_u] - [src_v] "+r"(src_v), // %[src_v] - [dst_argb] "+r"(dst_argb), // %[dst_argb] - [width] "+r"(width) // %[width] - : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff] - [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias) // %[kRGBCoeffBias] - : "cc", "memory", YUVTORGB_REGS, "d6"); -} - -void I444AlphaToARGBRow_NEON(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - const uint8_t* src_a, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { - asm volatile( - YUVTORGB_SETUP - "1: \n" READYUV444 YUVTORGB - RGBTORGB8 - "vld1.8 {d6}, [%[src_a]]! \n" - "subs %[width], %[width], #8 \n" - "vst4.8 {d0, d2, d4, d6}, [%[dst_argb]]! 
\n" - "bgt 1b \n" - : [src_y] "+r"(src_y), // %[src_y] - [src_u] "+r"(src_u), // %[src_u] - [src_v] "+r"(src_v), // %[src_v] - [src_a] "+r"(src_a), // %[src_a] - [dst_argb] "+r"(dst_argb), // %[dst_argb] - [width] "+r"(width) // %[width] - : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff] - [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias) // %[kRGBCoeffBias] - : "cc", "memory", YUVTORGB_REGS, "d6"); -} - -void I422AlphaToARGBRow_NEON(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - const uint8_t* src_a, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { - asm volatile( - YUVTORGB_SETUP - "1: \n" READYUV422 YUVTORGB - RGBTORGB8 - "vld1.8 {d6}, [%[src_a]]! \n" - "subs %[width], %[width], #8 \n" - "vst4.8 {d0, d2, d4, d6}, [%[dst_argb]]! \n" - "bgt 1b \n" - : [src_y] "+r"(src_y), // %[src_y] - [src_u] "+r"(src_u), // %[src_u] - [src_v] "+r"(src_v), // %[src_v] - [src_a] "+r"(src_a), // %[src_a] - [dst_argb] "+r"(dst_argb), // %[dst_argb] - [width] "+r"(width) // %[width] - : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff] - [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias) // %[kRGBCoeffBias] - : "cc", "memory", YUVTORGB_REGS, "d6"); -} - -void I422ToRGBARow_NEON(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* dst_rgba, - const struct YuvConstants* yuvconstants, - int width) { - asm volatile( - YUVTORGB_SETUP - "vmov.u8 d6, #255 \n" - "1: \n" READYUV422 YUVTORGB - RGBTORGB8 "subs %[width], %[width], #8 \n" STORERGBA - "bgt 1b \n" - : [src_y] "+r"(src_y), // %[src_y] - [src_u] "+r"(src_u), // %[src_u] - [src_v] "+r"(src_v), // %[src_v] - [dst_rgba] "+r"(dst_rgba), // %[dst_rgba] - [width] "+r"(width) // %[width] - : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff] - [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias) // %[kRGBCoeffBias] - : "cc", "memory", YUVTORGB_REGS, "d6"); -} - -void I422ToRGB24Row_NEON(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* dst_rgb24, - const struct YuvConstants* yuvconstants, - int width) { - asm volatile( - YUVTORGB_SETUP - "vmov.u8 d6, #255 \n" - "1: \n" READYUV422 YUVTORGB - RGBTORGB8 - "subs %[width], %[width], #8 \n" - "vst3.8 {d0, d2, d4}, [%[dst_rgb24]]! \n" - "bgt 1b \n" - : [src_y] "+r"(src_y), // %[src_y] - [src_u] "+r"(src_u), // %[src_u] - [src_v] "+r"(src_v), // %[src_v] - [dst_rgb24] "+r"(dst_rgb24), // %[dst_rgb24] - [width] "+r"(width) // %[width] - : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff] - [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias) // %[kRGBCoeffBias] - : "cc", "memory", YUVTORGB_REGS); -} - -#define ARGBTORGB565 \ - "vshll.u8 q2, d4, #8 \n" /* R */ \ - "vshll.u8 q1, d2, #8 \n" /* G */ \ - "vshll.u8 q0, d0, #8 \n" /* B */ \ - "vsri.16 q2, q1, #5 \n" /* RG */ \ - "vsri.16 q2, q0, #11 \n" /* RGB */ - -void I422ToRGB565Row_NEON(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* dst_rgb565, - const struct YuvConstants* yuvconstants, - int width) { - asm volatile( - YUVTORGB_SETUP - "vmov.u8 d6, #255 \n" - "1: \n" READYUV422 YUVTORGB - RGBTORGB8 "subs %[width], %[width], #8 \n" ARGBTORGB565 - "vst1.8 {q2}, [%[dst_rgb565]]! \n" // store 8 pixels RGB565. 
- "bgt 1b \n" - : [src_y] "+r"(src_y), // %[src_y] - [src_u] "+r"(src_u), // %[src_u] - [src_v] "+r"(src_v), // %[src_v] - [dst_rgb565] "+r"(dst_rgb565), // %[dst_rgb565] - [width] "+r"(width) // %[width] - : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff] - [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias) // %[kRGBCoeffBias] - : "cc", "memory", YUVTORGB_REGS); -} - -#define ARGBTOARGB1555 \ - "vshll.u8 q3, d6, #8 \n" /* A */ \ - "vshll.u8 q2, d4, #8 \n" /* R */ \ - "vshll.u8 q1, d2, #8 \n" /* G */ \ - "vshll.u8 q0, d0, #8 \n" /* B */ \ - "vsri.16 q3, q2, #1 \n" /* AR */ \ - "vsri.16 q3, q1, #6 \n" /* ARG */ \ - "vsri.16 q3, q0, #11 \n" /* ARGB */ - -void I422ToARGB1555Row_NEON(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* dst_argb1555, - const struct YuvConstants* yuvconstants, - int width) { - asm volatile( - YUVTORGB_SETUP - "1: \n" READYUV422 YUVTORGB - RGBTORGB8 - "subs %[width], %[width], #8 \n" - "vmov.u8 d6, #0xff \n" ARGBTOARGB1555 - "vst1.8 {q3}, [%[dst_argb1555]]! \n" // store 8 pixels RGB1555. - "bgt 1b \n" - : [src_y] "+r"(src_y), // %[src_y] - [src_u] "+r"(src_u), // %[src_u] - [src_v] "+r"(src_v), // %[src_v] - [dst_argb1555] "+r"(dst_argb1555), // %[dst_argb1555] - [width] "+r"(width) // %[width] - : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff] - [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias) // %[kRGBCoeffBias] - : "cc", "memory", YUVTORGB_REGS, "q3"); -} - -#define ARGBTOARGB4444 \ - "vshr.u8 d0, d0, #4 \n" /* B */ \ - "vbic.32 d2, d2, d7 \n" /* G */ \ - "vshr.u8 d4, d4, #4 \n" /* R */ \ - "vbic.32 d6, d6, d7 \n" /* A */ \ - "vorr d0, d0, d2 \n" /* BG */ \ - "vorr d1, d4, d6 \n" /* RA */ \ - "vzip.u8 d0, d1 \n" /* BGRA */ - -void I422ToARGB4444Row_NEON(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* dst_argb4444, - const struct YuvConstants* yuvconstants, - int width) { - asm volatile( - YUVTORGB_SETUP - "vmov.u8 d6, #255 \n" - "vmov.u8 d7, #0x0f \n" // vbic bits to clear - "1: \n" READYUV422 YUVTORGB - RGBTORGB8 - "subs %[width], %[width], #8 \n" ARGBTOARGB4444 - "vst1.8 {q0}, [%[dst_argb4444]]! \n" // store 8 pixels - "bgt 1b \n" - : [src_y] "+r"(src_y), // %[src_y] - [src_u] "+r"(src_u), // %[src_u] - [src_v] "+r"(src_v), // %[src_v] - [dst_argb4444] "+r"(dst_argb4444), // %[dst_argb4444] - [width] "+r"(width) // %[width] - : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff] - [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias) // %[kRGBCoeffBias] - : "cc", "memory", YUVTORGB_REGS, "q3"); -} - -void I400ToARGBRow_NEON(const uint8_t* src_y, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { - asm volatile( - YUVTORGB_SETUP - "vmov.u8 d6, #255 \n" - "1: \n" READYUV400 YUVTORGB - RGBTORGB8 - "subs %[width], %[width], #8 \n" - "vst4.8 {d0, d2, d4, d6}, [%[dst_argb]]! \n" - "bgt 1b \n" - : [src_y] "+r"(src_y), // %[src_y] - [dst_argb] "+r"(dst_argb), // %[dst_argb] - [width] "+r"(width) // %[width] - : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff] - [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias) // %[kRGBCoeffBias] - : "cc", "memory", YUVTORGB_REGS, "d6"); -} - -void J400ToARGBRow_NEON(const uint8_t* src_y, uint8_t* dst_argb, int width) { - asm volatile( - "vmov.u8 d23, #255 \n" - "1: \n" - "vld1.8 {d20}, [%0]! \n" - "vmov d21, d20 \n" - "vmov d22, d20 \n" - "subs %2, %2, #8 \n" - "vst4.8 {d20, d21, d22, d23}, [%1]! 
\n" - "bgt 1b \n" - : "+r"(src_y), // %0 - "+r"(dst_argb), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "d20", "d21", "d22", "d23"); -} - -void NV12ToARGBRow_NEON(const uint8_t* src_y, - const uint8_t* src_uv, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { - asm volatile( - YUVTORGB_SETUP - "vmov.u8 d6, #255 \n" - "1: \n" READNV12 YUVTORGB RGBTORGB8 - "subs %[width], %[width], #8 \n" - "vst4.8 {d0, d2, d4, d6}, [%[dst_argb]]! \n" - "bgt 1b \n" - : [src_y] "+r"(src_y), // %[src_y] - [src_uv] "+r"(src_uv), // %[src_uv] - [dst_argb] "+r"(dst_argb), // %[dst_argb] - [width] "+r"(width) // %[width] - : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff] - [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias) // %[kRGBCoeffBias] - : "cc", "memory", YUVTORGB_REGS, "d6"); -} - -void NV21ToARGBRow_NEON(const uint8_t* src_y, - const uint8_t* src_vu, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { - asm volatile( - YUVTORGB_SETUP - "vmov.u8 d6, #255 \n" - "1: \n" READNV21 YUVTORGB RGBTORGB8 - "subs %[width], %[width], #8 \n" - "vst4.8 {d0, d2, d4, d6}, [%[dst_argb]]! \n" - "bgt 1b \n" - : [src_y] "+r"(src_y), // %[src_y] - [src_vu] "+r"(src_vu), // %[src_vu] - [dst_argb] "+r"(dst_argb), // %[dst_argb] - [width] "+r"(width) // %[width] - : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff] - [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias) // %[kRGBCoeffBias] - : "cc", "memory", YUVTORGB_REGS, "d6"); -} - -void NV12ToRGB24Row_NEON(const uint8_t* src_y, - const uint8_t* src_uv, - uint8_t* dst_rgb24, - const struct YuvConstants* yuvconstants, - int width) { - asm volatile( - YUVTORGB_SETUP - "vmov.u8 d6, #255 \n" - "1: \n" READNV12 YUVTORGB RGBTORGB8 - "subs %[width], %[width], #8 \n" - "vst3.8 {d0, d2, d4}, [%[dst_rgb24]]! \n" - "bgt 1b \n" - : [src_y] "+r"(src_y), // %[src_y] - [src_uv] "+r"(src_uv), // %[src_uv] - [dst_rgb24] "+r"(dst_rgb24), // %[dst_rgb24] - [width] "+r"(width) // %[width] - : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff] - [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias) // %[kRGBCoeffBias] - : "cc", "memory", YUVTORGB_REGS); -} - -void NV21ToRGB24Row_NEON(const uint8_t* src_y, - const uint8_t* src_vu, - uint8_t* dst_rgb24, - const struct YuvConstants* yuvconstants, - int width) { - asm volatile( - YUVTORGB_SETUP - "vmov.u8 d6, #255 \n" - "1: \n" READNV21 YUVTORGB RGBTORGB8 - "subs %[width], %[width], #8 \n" - "vst3.8 {d0, d2, d4}, [%[dst_rgb24]]! \n" - "bgt 1b \n" - : [src_y] "+r"(src_y), // %[src_y] - [src_vu] "+r"(src_vu), // %[src_vu] - [dst_rgb24] "+r"(dst_rgb24), // %[dst_rgb24] - [width] "+r"(width) // %[width] - : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff] - [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias) // %[kRGBCoeffBias] - : "cc", "memory", YUVTORGB_REGS); -} - -void NV12ToRGB565Row_NEON(const uint8_t* src_y, - const uint8_t* src_uv, - uint8_t* dst_rgb565, - const struct YuvConstants* yuvconstants, - int width) { - asm volatile( - YUVTORGB_SETUP - "vmov.u8 d6, #255 \n" - "1: \n" READNV12 YUVTORGB RGBTORGB8 - "subs %[width], %[width], #8 \n" ARGBTORGB565 - "vst1.8 {q2}, [%[dst_rgb565]]! \n" // store 8 pixels RGB565. 
- "bgt 1b \n" - : [src_y] "+r"(src_y), // %[src_y] - [src_uv] "+r"(src_uv), // %[src_uv] - [dst_rgb565] "+r"(dst_rgb565), // %[dst_rgb565] - [width] "+r"(width) // %[width] - : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff] - [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias) // %[kRGBCoeffBias] - : "cc", "memory", YUVTORGB_REGS); -} - -void YUY2ToARGBRow_NEON(const uint8_t* src_yuy2, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { - asm volatile( - YUVTORGB_SETUP - "vmov.u8 d6, #255 \n" - "1: \n" READYUY2 YUVTORGB RGBTORGB8 - "subs %[width], %[width], #8 \n" - "vst4.8 {d0, d2, d4, d6}, [%[dst_argb]]! \n" - "bgt 1b \n" - : [src_yuy2] "+r"(src_yuy2), // %[src_yuy2] - [dst_argb] "+r"(dst_argb), // %[dst_argb] - [width] "+r"(width) // %[width] - : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff] - [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias) // %[kRGBCoeffBias] - : "cc", "memory", YUVTORGB_REGS, "d6"); -} - -void UYVYToARGBRow_NEON(const uint8_t* src_uyvy, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { - asm volatile( - YUVTORGB_SETUP - "vmov.u8 d6, #255 \n" - "1: \n" READUYVY YUVTORGB RGBTORGB8 - "subs %[width], %[width], #8 \n" - "vst4.8 {d0, d2, d4, d6}, [%[dst_argb]]! \n" - "bgt 1b \n" - : [src_uyvy] "+r"(src_uyvy), // %[src_uyvy] - [dst_argb] "+r"(dst_argb), // %[dst_argb] - [width] "+r"(width) // %[width] - : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff] - [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias) // %[kRGBCoeffBias] - : "cc", "memory", YUVTORGB_REGS, "d6"); -} - -// Reads 16 pairs of UV and write even values to dst_u and odd to dst_v. -void SplitUVRow_NEON(const uint8_t* src_uv, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - asm volatile( - "1: \n" - "vld2.8 {q0, q1}, [%0]! \n" // load 16 pairs of UV - "subs %3, %3, #16 \n" // 16 processed per loop - "vst1.8 {q0}, [%1]! \n" // store U - "vst1.8 {q1}, [%2]! \n" // store V - "bgt 1b \n" - : "+r"(src_uv), // %0 - "+r"(dst_u), // %1 - "+r"(dst_v), // %2 - "+r"(width) // %3 // Output registers - : // Input registers - : "cc", "memory", "q0", "q1" // Clobber List - ); -} - -// Reads 16 U's and V's and writes out 16 pairs of UV. -void MergeUVRow_NEON(const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* dst_uv, - int width) { - asm volatile( - "1: \n" - "vld1.8 {q0}, [%0]! \n" // load U - "vld1.8 {q1}, [%1]! \n" // load V - "subs %3, %3, #16 \n" // 16 processed per loop - "vst2.8 {q0, q1}, [%2]! \n" // store 16 pairs of UV - "bgt 1b \n" - : "+r"(src_u), // %0 - "+r"(src_v), // %1 - "+r"(dst_uv), // %2 - "+r"(width) // %3 // Output registers - : // Input registers - : "cc", "memory", "q0", "q1" // Clobber List - ); -} - -// Reads 16 packed RGB and write to planar dst_r, dst_g, dst_b. -void SplitRGBRow_NEON(const uint8_t* src_rgb, - uint8_t* dst_r, - uint8_t* dst_g, - uint8_t* dst_b, - int width) { - asm volatile( - "1: \n" - "vld3.8 {d0, d2, d4}, [%0]! \n" // load 8 RGB - "vld3.8 {d1, d3, d5}, [%0]! \n" // next 8 RGB - "subs %4, %4, #16 \n" // 16 processed per loop - "vst1.8 {q0}, [%1]! \n" // store R - "vst1.8 {q1}, [%2]! \n" // store G - "vst1.8 {q2}, [%3]! 
\n" // store B - "bgt 1b \n" - : "+r"(src_rgb), // %0 - "+r"(dst_r), // %1 - "+r"(dst_g), // %2 - "+r"(dst_b), // %3 - "+r"(width) // %4 - : // Input registers - : "cc", "memory", "d0", "d1", "d2" // Clobber List - ); -} - -// Reads 16 planar R's, G's and B's and writes out 16 packed RGB at a time -void MergeRGBRow_NEON(const uint8_t* src_r, - const uint8_t* src_g, - const uint8_t* src_b, - uint8_t* dst_rgb, - int width) { - asm volatile( - "1: \n" - "vld1.8 {q0}, [%0]! \n" // load R - "vld1.8 {q1}, [%1]! \n" // load G - "vld1.8 {q2}, [%2]! \n" // load B - "subs %4, %4, #16 \n" // 16 processed per loop - "vst3.8 {d0, d2, d4}, [%3]! \n" // store 8 RGB - "vst3.8 {d1, d3, d5}, [%3]! \n" // next 8 RGB - "bgt 1b \n" - : "+r"(src_r), // %0 - "+r"(src_g), // %1 - "+r"(src_b), // %2 - "+r"(dst_rgb), // %3 - "+r"(width) // %4 - : // Input registers - : "cc", "memory", "q0", "q1", "q2" // Clobber List - ); -} - -// Reads 16 packed ARGB and write to planar dst_r, dst_g, dst_b, dst_a. -void SplitARGBRow_NEON(const uint8_t* src_argb, - uint8_t* dst_r, - uint8_t* dst_g, - uint8_t* dst_b, - uint8_t* dst_a, - int width) { - asm volatile( - "1: \n" - "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB - "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // next 8 ARGB - "subs %5, %5, #16 \n" // 16 processed per loop - "vst1.8 {q0}, [%3]! \n" // store B - "vst1.8 {q1}, [%2]! \n" // store G - "vst1.8 {q2}, [%1]! \n" // store R - "vst1.8 {q3}, [%4]! \n" // store A - "bgt 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_r), // %1 - "+r"(dst_g), // %2 - "+r"(dst_b), // %3 - "+r"(dst_a), // %4 - "+r"(width) // %5 - : // Input registers - : "cc", "memory", "q0", "q1", "q2", "q3" // Clobber List - ); -} - -// Reads 16 planar R's, G's and B's and writes out 16 packed ARGB at a time -void MergeARGBRow_NEON(const uint8_t* src_r, - const uint8_t* src_g, - const uint8_t* src_b, - const uint8_t* src_a, - uint8_t* dst_argb, - int width) { - asm volatile( - "1: \n" - "vld1.8 {q2}, [%0]! \n" // load R - "vld1.8 {q1}, [%1]! \n" // load G - "vld1.8 {q0}, [%2]! \n" // load B - "vld1.8 {q3}, [%3]! \n" // load A - "subs %5, %5, #16 \n" // 16 processed per loop - "vst4.8 {d0, d2, d4, d6}, [%4]! \n" // store 8 ARGB - "vst4.8 {d1, d3, d5, d7}, [%4]! \n" // next 8 ARGB - "bgt 1b \n" - : "+r"(src_r), // %0 - "+r"(src_g), // %1 - "+r"(src_b), // %2 - "+r"(src_a), // %3 - "+r"(dst_argb), // %4 - "+r"(width) // %5 - : // Input registers - : "cc", "memory", "q0", "q1", "q2", "q3" // Clobber List - ); -} - -// Reads 16 packed ARGB and write to planar dst_r, dst_g, dst_b. -void SplitXRGBRow_NEON(const uint8_t* src_argb, - uint8_t* dst_r, - uint8_t* dst_g, - uint8_t* dst_b, - int width) { - asm volatile( - "1: \n" - "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB - "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // next 8 ARGB - "subs %4, %4, #16 \n" // 16 processed per loop - "vst1.8 {q0}, [%3]! \n" // store B - "vst1.8 {q1}, [%2]! \n" // store G - "vst1.8 {q2}, [%1]! \n" // store R - "bgt 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_r), // %1 - "+r"(dst_g), // %2 - "+r"(dst_b), // %3 - "+r"(width) // %4 - : // Input registers - : "cc", "memory", "q0", "q1", "q2", "q3" // Clobber List - ); -} - -// Reads 16 planar R's, G's, B's and A's and writes out 16 packed ARGB at a time -void MergeXRGBRow_NEON(const uint8_t* src_r, - const uint8_t* src_g, - const uint8_t* src_b, - uint8_t* dst_argb, - int width) { - asm volatile( - "vmov.u8 q3, #255 \n" // load A(255) - "1: \n" - "vld1.8 {q2}, [%0]! \n" // load R - "vld1.8 {q1}, [%1]! \n" // load G - "vld1.8 {q0}, [%2]! 
\n" // load B - "subs %4, %4, #16 \n" // 16 processed per loop - "vst4.8 {d0, d2, d4, d6}, [%3]! \n" // store 8 ARGB - "vst4.8 {d1, d3, d5, d7}, [%3]! \n" // next 8 ARGB - "bgt 1b \n" - : "+r"(src_r), // %0 - "+r"(src_g), // %1 - "+r"(src_b), // %2 - "+r"(dst_argb), // %3 - "+r"(width) // %4 - : // Input registers - : "cc", "memory", "q0", "q1", "q2", "q3" // Clobber List - ); -} - -void MergeXR30Row_NEON(const uint16_t* src_r, - const uint16_t* src_g, - const uint16_t* src_b, - uint8_t* dst_ar30, - int depth, - int width) { - int shift = 10 - depth; - asm volatile( - "vmov.u32 q14, #1023 \n" - "vdup.32 q15, %5 \n" - "1: \n" - "vld1.16 {d4}, [%2]! \n" // B - "vld1.16 {d2}, [%1]! \n" // G - "vld1.16 {d0}, [%0]! \n" // R - "vmovl.u16 q2, d4 \n" // B - "vmovl.u16 q1, d2 \n" // G - "vmovl.u16 q0, d0 \n" // R - "vshl.u32 q2, q2, q15 \n" // 000B - "vshl.u32 q1, q1, q15 \n" - "vshl.u32 q0, q0, q15 \n" - "vmin.u32 q2, q2, q14 \n" - "vmin.u32 q1, q1, q14 \n" - "vmin.u32 q0, q0, q14 \n" - "vsli.u32 q2, q1, #10 \n" // 00GB - "vsli.u32 q2, q0, #20 \n" // 0RGB - "vorr.u32 q2, #0xc0000000 \n" // ARGB (AR30) - "subs %4, %4, #4 \n" - "vst1.8 {q2}, [%3]! \n" - "bgt 1b \n" - : "+r"(src_r), // %0 - "+r"(src_g), // %1 - "+r"(src_b), // %2 - "+r"(dst_ar30), // %3 - "+r"(width) // %4 - : "r"(shift) // %5 - : "memory", "cc", "q0", "q1", "q2", "q14", "q15"); -} - -void MergeXR30Row_10_NEON(const uint16_t* src_r, - const uint16_t* src_g, - const uint16_t* src_b, - uint8_t* dst_ar30, - int /* depth */, - int width) { - asm volatile( - "vmov.u32 q14, #1023 \n" - "1: \n" - "vld1.16 {d4}, [%2]! \n" // B - "vld1.16 {d2}, [%1]! \n" // G - "vld1.16 {d0}, [%0]! \n" // R - "vmovl.u16 q2, d4 \n" // 000B - "vmovl.u16 q1, d2 \n" // G - "vmovl.u16 q0, d0 \n" // R - "vmin.u32 q2, q2, q14 \n" - "vmin.u32 q1, q1, q14 \n" - "vmin.u32 q0, q0, q14 \n" - "vsli.u32 q2, q1, #10 \n" // 00GB - "vsli.u32 q2, q0, #20 \n" // 0RGB - "vorr.u32 q2, #0xc0000000 \n" // ARGB (AR30) - "subs %4, %4, #4 \n" - "vst1.8 {q2}, [%3]! \n" - "bgt 1b \n" - "3: \n" - : "+r"(src_r), // %0 - "+r"(src_g), // %1 - "+r"(src_b), // %2 - "+r"(dst_ar30), // %3 - "+r"(width) // %4 - : - : "memory", "cc", "q0", "q1", "q2", "q14"); -} - -void MergeAR64Row_NEON(const uint16_t* src_r, - const uint16_t* src_g, - const uint16_t* src_b, - const uint16_t* src_a, - uint16_t* dst_ar64, - int depth, - int width) { - int shift = 16 - depth; - int mask = (1 << depth) - 1; - asm volatile( - - "vdup.u16 q15, %6 \n" - "vdup.u16 q14, %7 \n" - "1: \n" - "vld1.16 {q2}, [%0]! \n" // R - "vld1.16 {q1}, [%1]! \n" // G - "vld1.16 {q0}, [%2]! \n" // B - "vld1.16 {q3}, [%3]! \n" // A - "vmin.u16 q2, q2, q14 \n" - "vmin.u16 q1, q1, q14 \n" - "vmin.u16 q0, q0, q14 \n" - "vmin.u16 q3, q3, q14 \n" - "vshl.u16 q2, q2, q15 \n" - "vshl.u16 q1, q1, q15 \n" - "vshl.u16 q0, q0, q15 \n" - "vshl.u16 q3, q3, q15 \n" - "subs %5, %5, #8 \n" - "vst4.16 {d0, d2, d4, d6}, [%4]! \n" - "vst4.16 {d1, d3, d5, d7}, [%4]! \n" - "bgt 1b \n" - : "+r"(src_r), // %0 - "+r"(src_g), // %1 - "+r"(src_b), // %2 - "+r"(src_a), // %3 - "+r"(dst_ar64), // %4 - "+r"(width) // %5 - : "r"(shift), // %6 - "r"(mask) // %7 - : "memory", "cc", "q0", "q1", "q2", "q3", "q15"); -} - -void MergeXR64Row_NEON(const uint16_t* src_r, - const uint16_t* src_g, - const uint16_t* src_b, - uint16_t* dst_ar64, - int depth, - int width) { - int shift = 16 - depth; - int mask = (1 << depth) - 1; - asm volatile( - - "vmov.u8 q3, #0xff \n" // A (0xffff) - "vdup.u16 q15, %5 \n" - "vdup.u16 q14, %6 \n" - "1: \n" - "vld1.16 {q2}, [%0]! 
\n" // R - "vld1.16 {q1}, [%1]! \n" // G - "vld1.16 {q0}, [%2]! \n" // B - "vmin.u16 q2, q2, q14 \n" - "vmin.u16 q1, q1, q14 \n" - "vmin.u16 q0, q0, q14 \n" - "vshl.u16 q2, q2, q15 \n" - "vshl.u16 q1, q1, q15 \n" - "vshl.u16 q0, q0, q15 \n" - "subs %4, %4, #8 \n" - "vst4.16 {d0, d2, d4, d6}, [%3]! \n" - "vst4.16 {d1, d3, d5, d7}, [%3]! \n" - "bgt 1b \n" - : "+r"(src_r), // %0 - "+r"(src_g), // %1 - "+r"(src_b), // %2 - "+r"(dst_ar64), // %3 - "+r"(width) // %4 - : "r"(shift), // %5 - "r"(mask) // %6 - : "memory", "cc", "q0", "q1", "q2", "q3", "q15"); -} - -void MergeARGB16To8Row_NEON(const uint16_t* src_r, - const uint16_t* src_g, - const uint16_t* src_b, - const uint16_t* src_a, - uint8_t* dst_argb, - int depth, - int width) { - int shift = 8 - depth; - asm volatile( - - "vdup.16 q15, %6 \n" - "1: \n" - "vld1.16 {q2}, [%0]! \n" // R - "vld1.16 {q1}, [%1]! \n" // G - "vld1.16 {q0}, [%2]! \n" // B - "vld1.16 {q3}, [%3]! \n" // A - "vshl.u16 q2, q2, q15 \n" - "vshl.u16 q1, q1, q15 \n" - "vshl.u16 q0, q0, q15 \n" - "vshl.u16 q3, q3, q15 \n" - "vqmovn.u16 d0, q0 \n" - "vqmovn.u16 d1, q1 \n" - "vqmovn.u16 d2, q2 \n" - "vqmovn.u16 d3, q3 \n" - "subs %5, %5, #8 \n" - "vst4.8 {d0, d1, d2, d3}, [%4]! \n" - "bgt 1b \n" - : "+r"(src_r), // %0 - "+r"(src_g), // %1 - "+r"(src_b), // %2 - "+r"(src_a), // %3 - "+r"(dst_argb), // %4 - "+r"(width) // %5 - : "r"(shift) // %6 - : "memory", "cc", "q0", "q1", "q2", "q3", "q15"); -} - -void MergeXRGB16To8Row_NEON(const uint16_t* src_r, - const uint16_t* src_g, - const uint16_t* src_b, - uint8_t* dst_argb, - int depth, - int width) { - int shift = 8 - depth; - asm volatile( - - "vdup.16 q15, %5 \n" - "vmov.u8 d6, #0xff \n" // A (0xff) - "1: \n" - "vld1.16 {q2}, [%0]! \n" // R - "vld1.16 {q1}, [%1]! \n" // G - "vld1.16 {q0}, [%2]! \n" // B - "vshl.u16 q2, q2, q15 \n" - "vshl.u16 q1, q1, q15 \n" - "vshl.u16 q0, q0, q15 \n" - "vqmovn.u16 d5, q2 \n" - "vqmovn.u16 d4, q1 \n" - "vqmovn.u16 d3, q0 \n" - "subs %4, %4, #8 \n" - "vst4.u8 {d3, d4, d5, d6}, [%3]! \n" - "bgt 1b \n" - : "+r"(src_r), // %0 - "+r"(src_g), // %1 - "+r"(src_b), // %2 - "+r"(dst_argb), // %3 - "+r"(width) // %4 - : "r"(shift) // %5 - : "memory", "cc", "q0", "q1", "q2", "d6", "q15"); -} - -// Copy multiple of 32. vld4.8 allow unaligned and is fastest on a15. -void CopyRow_NEON(const uint8_t* src, uint8_t* dst, int width) { - asm volatile( - "1: \n" - "vld1.8 {d0, d1, d2, d3}, [%0]! \n" // load 32 - "subs %2, %2, #32 \n" // 32 processed per loop - "vst1.8 {d0, d1, d2, d3}, [%1]! \n" // store 32 - "bgt 1b \n" - : "+r"(src), // %0 - "+r"(dst), // %1 - "+r"(width) // %2 // Output registers - : // Input registers - : "cc", "memory", "q0", "q1" // Clobber List - ); -} - -// SetRow writes 'width' bytes using an 8 bit value repeated. -void SetRow_NEON(uint8_t* dst, uint8_t v8, int width) { - asm volatile( - "vdup.8 q0, %2 \n" // duplicate 16 bytes - "1: \n" - "subs %1, %1, #16 \n" // 16 bytes per loop - "vst1.8 {q0}, [%0]! \n" // store - "bgt 1b \n" - : "+r"(dst), // %0 - "+r"(width) // %1 - : "r"(v8) // %2 - : "cc", "memory", "q0"); -} - -// ARGBSetRow writes 'width' pixels using an 32 bit value repeated. -void ARGBSetRow_NEON(uint8_t* dst, uint32_t v32, int width) { - asm volatile( - "vdup.u32 q0, %2 \n" // duplicate 4 ints - "1: \n" - "subs %1, %1, #4 \n" // 4 pixels per loop - "vst1.8 {q0}, [%0]! 
\n" // store - "bgt 1b \n" - : "+r"(dst), // %0 - "+r"(width) // %1 - : "r"(v32) // %2 - : "cc", "memory", "q0"); -} - -void MirrorRow_NEON(const uint8_t* src, uint8_t* dst, int width) { - asm volatile( - // Start at end of source row. - "add %0, %0, %2 \n" - "sub %0, %0, #32 \n" // 32 bytes per loop - - "1: \n" - "vld1.8 {q1, q2}, [%0], %3 \n" // src -= 32 - "subs %2, #32 \n" // 32 pixels per loop. - "vrev64.8 q0, q2 \n" - "vrev64.8 q1, q1 \n" - "vswp d0, d1 \n" - "vswp d2, d3 \n" - "vst1.8 {q0, q1}, [%1]! \n" // dst += 32 - "bgt 1b \n" - : "+r"(src), // %0 - "+r"(dst), // %1 - "+r"(width) // %2 - : "r"(-32) // %3 - : "cc", "memory", "q0", "q1", "q2"); -} - -void MirrorUVRow_NEON(const uint8_t* src_uv, uint8_t* dst_uv, int width) { - asm volatile( - // Start at end of source row. - "mov r12, #-16 \n" - "add %0, %0, %2, lsl #1 \n" - "sub %0, #16 \n" - - "1: \n" - "vld2.8 {d0, d1}, [%0], r12 \n" // src -= 16 - "subs %2, #8 \n" // 8 pixels per loop. - "vrev64.8 q0, q0 \n" - "vst2.8 {d0, d1}, [%1]! \n" // dst += 16 - "bgt 1b \n" - : "+r"(src_uv), // %0 - "+r"(dst_uv), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "r12", "q0"); -} - -void MirrorSplitUVRow_NEON(const uint8_t* src_uv, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - asm volatile( - // Start at end of source row. - "mov r12, #-16 \n" - "add %0, %0, %3, lsl #1 \n" - "sub %0, #16 \n" - - "1: \n" - "vld2.8 {d0, d1}, [%0], r12 \n" // src -= 16 - "subs %3, #8 \n" // 8 pixels per loop. - "vrev64.8 q0, q0 \n" - "vst1.8 {d0}, [%1]! \n" // dst += 8 - "vst1.8 {d1}, [%2]! \n" - "bgt 1b \n" - : "+r"(src_uv), // %0 - "+r"(dst_u), // %1 - "+r"(dst_v), // %2 - "+r"(width) // %3 - : - : "cc", "memory", "r12", "q0"); -} - -void ARGBMirrorRow_NEON(const uint8_t* src_argb, uint8_t* dst_argb, int width) { - asm volatile( - "add %0, %0, %2, lsl #2 \n" - "sub %0, #32 \n" - - "1: \n" - "vld4.8 {d0, d1, d2, d3}, [%0], %3 \n" // src -= 32 - "subs %2, #8 \n" // 8 pixels per loop. - "vrev64.8 d0, d0 \n" - "vrev64.8 d1, d1 \n" - "vrev64.8 d2, d2 \n" - "vrev64.8 d3, d3 \n" - "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // dst += 32 - "bgt 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_argb), // %1 - "+r"(width) // %2 - : "r"(-32) // %3 - : "cc", "memory", "d0", "d1", "d2", "d3"); -} - -void RGB24MirrorRow_NEON(const uint8_t* src_rgb24, - uint8_t* dst_rgb24, - int width) { - src_rgb24 += width * 3 - 24; - asm volatile( - "1: \n" - "vld3.8 {d0, d1, d2}, [%0], %3 \n" // src -= 24 - "subs %2, #8 \n" // 8 pixels per loop. - "vrev64.8 d0, d0 \n" - "vrev64.8 d1, d1 \n" - "vrev64.8 d2, d2 \n" - "vst3.8 {d0, d1, d2}, [%1]! \n" // dst += 24 - "bgt 1b \n" - : "+r"(src_rgb24), // %0 - "+r"(dst_rgb24), // %1 - "+r"(width) // %2 - : "r"(-24) // %3 - : "cc", "memory", "d0", "d1", "d2"); -} - -void RGB24ToARGBRow_NEON(const uint8_t* src_rgb24, - uint8_t* dst_argb, - int width) { - asm volatile( - "vmov.u8 d4, #255 \n" // Alpha - "1: \n" - "vld3.8 {d1, d2, d3}, [%0]! \n" // load 8 pixels of RGB24. - "subs %2, %2, #8 \n" // 8 processed per loop. - "vst4.8 {d1, d2, d3, d4}, [%1]! \n" // store 8 pixels of ARGB. - "bgt 1b \n" - : "+r"(src_rgb24), // %0 - "+r"(dst_argb), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "d1", "d2", "d3", "d4" // Clobber List - ); -} - -void RAWToARGBRow_NEON(const uint8_t* src_raw, uint8_t* dst_argb, int width) { - asm volatile( - "vmov.u8 d4, #255 \n" // Alpha - "1: \n" - "vld3.8 {d1, d2, d3}, [%0]! \n" // load 8 pixels of RAW. - "subs %2, %2, #8 \n" // 8 processed per loop. - "vswp.u8 d1, d3 \n" // swap R, B - "vst4.8 {d1, d2, d3, d4}, [%1]! 
\n" // store 8 pixels of ARGB. - "bgt 1b \n" - : "+r"(src_raw), // %0 - "+r"(dst_argb), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "d1", "d2", "d3", "d4" // Clobber List - ); -} - -void RAWToRGBARow_NEON(const uint8_t* src_raw, uint8_t* dst_rgba, int width) { - asm volatile( - "vmov.u8 d0, #255 \n" // Alpha - "1: \n" - "vld3.8 {d1, d2, d3}, [%0]! \n" // load 8 pixels of RAW. - "subs %2, %2, #8 \n" // 8 processed per loop. - "vswp.u8 d1, d3 \n" // swap R, B - "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 pixels of RGBA. - "bgt 1b \n" - : "+r"(src_raw), // %0 - "+r"(dst_rgba), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "d0", "d1", "d2", "d3" // Clobber List - ); -} -void RAWToRGB24Row_NEON(const uint8_t* src_raw, uint8_t* dst_rgb24, int width) { - asm volatile( - "1: \n" - "vld3.8 {d1, d2, d3}, [%0]! \n" // load 8 pixels of RAW. - "subs %2, %2, #8 \n" // 8 processed per loop. - "vswp.u8 d1, d3 \n" // swap R, B - "vst3.8 {d1, d2, d3}, [%1]! \n" // store 8 pixels of - // RGB24. - "bgt 1b \n" - : "+r"(src_raw), // %0 - "+r"(dst_rgb24), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "d1", "d2", "d3" // Clobber List - ); -} - -#define RGB565TOARGB \ - "vshrn.u16 d6, q0, #5 \n" /* G xxGGGGGG */ \ - "vuzp.u8 d0, d1 \n" /* d0 xxxBBBBB RRRRRxxx */ \ - "vshl.u8 d6, d6, #2 \n" /* G GGGGGG00 upper 6 */ \ - "vshr.u8 d1, d1, #3 \n" /* R 000RRRRR lower 5 */ \ - "vshl.u8 q0, q0, #3 \n" /* B,R BBBBB000 upper 5 */ \ - "vshr.u8 q2, q0, #5 \n" /* B,R 00000BBB lower 3 */ \ - "vorr.u8 d0, d0, d4 \n" /* B */ \ - "vshr.u8 d4, d6, #6 \n" /* G 000000GG lower 2 */ \ - "vorr.u8 d2, d1, d5 \n" /* R */ \ - "vorr.u8 d1, d4, d6 \n" /* G */ - -void RGB565ToARGBRow_NEON(const uint8_t* src_rgb565, - uint8_t* dst_argb, - int width) { - asm volatile( - "vmov.u8 d3, #255 \n" // Alpha - "1: \n" - "vld1.8 {q0}, [%0]! \n" // load 8 RGB565 pixels. - "subs %2, %2, #8 \n" // 8 processed per loop. - RGB565TOARGB - "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 pixels of ARGB. - "bgt 1b \n" - : "+r"(src_rgb565), // %0 - "+r"(dst_argb), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "q0", "q1", "q2", "q3" // Clobber List - ); -} - -#define ARGB1555TOARGB \ - "vshrn.u16 d7, q0, #8 \n" /* A Arrrrrxx */ \ - "vshr.u8 d6, d7, #2 \n" /* R xxxRRRRR */ \ - "vshrn.u16 d5, q0, #5 \n" /* G xxxGGGGG */ \ - "vmovn.u16 d4, q0 \n" /* B xxxBBBBB */ \ - "vshr.u8 d7, d7, #7 \n" /* A 0000000A */ \ - "vneg.s8 d7, d7 \n" /* A AAAAAAAA upper 8 */ \ - "vshl.u8 d6, d6, #3 \n" /* R RRRRR000 upper 5 */ \ - "vshr.u8 q1, q3, #5 \n" /* R,A 00000RRR lower 3 */ \ - "vshl.u8 q0, q2, #3 \n" /* B,G BBBBB000 upper 5 */ \ - "vshr.u8 q2, q0, #5 \n" /* B,G 00000BBB lower 3 */ \ - "vorr.u8 q1, q1, q3 \n" /* R,A */ \ - "vorr.u8 q0, q0, q2 \n" /* B,G */ - -// RGB555TOARGB is same as ARGB1555TOARGB but ignores alpha. -#define RGB555TOARGB \ - "vshrn.u16 d6, q0, #5 \n" /* G xxxGGGGG */ \ - "vuzp.u8 d0, d1 \n" /* d0 xxxBBBBB xRRRRRxx */ \ - "vshl.u8 d6, d6, #3 \n" /* G GGGGG000 upper 5 */ \ - "vshr.u8 d1, d1, #2 \n" /* R 00xRRRRR lower 5 */ \ - "vshl.u8 q0, q0, #3 \n" /* B,R BBBBB000 upper 5 */ \ - "vshr.u8 q2, q0, #5 \n" /* B,R 00000BBB lower 3 */ \ - "vorr.u8 d0, d0, d4 \n" /* B */ \ - "vshr.u8 d4, d6, #5 \n" /* G 00000GGG lower 3 */ \ - "vorr.u8 d2, d1, d5 \n" /* R */ \ - "vorr.u8 d1, d4, d6 \n" /* G */ - -void ARGB1555ToARGBRow_NEON(const uint8_t* src_argb1555, - uint8_t* dst_argb, - int width) { - asm volatile( - "vmov.u8 d3, #255 \n" // Alpha - "1: \n" - "vld1.8 {q0}, [%0]! \n" // load 8 ARGB1555 pixels. - "subs %2, %2, #8 \n" // 8 processed per loop. 
- ARGB1555TOARGB - "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 pixels of ARGB. - "bgt 1b \n" - : "+r"(src_argb1555), // %0 - "+r"(dst_argb), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "q0", "q1", "q2", "q3" // Clobber List - ); -} - -#define ARGB4444TOARGB \ - "vuzp.u8 d0, d1 \n" /* d0 BG, d1 RA */ \ - "vshl.u8 q2, q0, #4 \n" /* B,R BBBB0000 */ \ - "vshr.u8 q1, q0, #4 \n" /* G,A 0000GGGG */ \ - "vshr.u8 q0, q2, #4 \n" /* B,R 0000BBBB */ \ - "vorr.u8 q0, q0, q2 \n" /* B,R BBBBBBBB */ \ - "vshl.u8 q2, q1, #4 \n" /* G,A GGGG0000 */ \ - "vorr.u8 q1, q1, q2 \n" /* G,A GGGGGGGG */ \ - "vswp.u8 d1, d2 \n" /* B,R,G,A -> B,G,R,A */ - -void ARGB4444ToARGBRow_NEON(const uint8_t* src_argb4444, - uint8_t* dst_argb, - int width) { - asm volatile( - "vmov.u8 d3, #255 \n" // Alpha - "1: \n" - "vld1.8 {q0}, [%0]! \n" // load 8 ARGB4444 pixels. - "subs %2, %2, #8 \n" // 8 processed per loop. - ARGB4444TOARGB - "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 pixels of ARGB. - "bgt 1b \n" - : "+r"(src_argb4444), // %0 - "+r"(dst_argb), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "q0", "q1", "q2" // Clobber List - ); -} - -void ARGBToRGB24Row_NEON(const uint8_t* src_argb, - uint8_t* dst_rgb24, - int width) { - asm volatile( - "1: \n" - "vld4.8 {d1, d2, d3, d4}, [%0]! \n" // load 8 pixels of ARGB. - "subs %2, %2, #8 \n" // 8 processed per loop. - "vst3.8 {d1, d2, d3}, [%1]! \n" // store 8 pixels of - // RGB24. - "bgt 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_rgb24), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "d1", "d2", "d3", "d4" // Clobber List - ); -} - -void ARGBToRAWRow_NEON(const uint8_t* src_argb, uint8_t* dst_raw, int width) { - asm volatile( - "1: \n" - "vld4.8 {d1, d2, d3, d4}, [%0]! \n" // load 8 pixels of ARGB. - "subs %2, %2, #8 \n" // 8 processed per loop. - "vswp.u8 d1, d3 \n" // swap R, B - "vst3.8 {d1, d2, d3}, [%1]! \n" // store 8 pixels of RAW. - "bgt 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_raw), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "d1", "d2", "d3", "d4" // Clobber List - ); -} - -void YUY2ToYRow_NEON(const uint8_t* src_yuy2, uint8_t* dst_y, int width) { - asm volatile( - "1: \n" - "vld2.8 {q0, q1}, [%0]! \n" // load 16 pixels of YUY2. - "subs %2, %2, #16 \n" // 16 processed per loop. - "vst1.8 {q0}, [%1]! \n" // store 16 pixels of Y. - "bgt 1b \n" - : "+r"(src_yuy2), // %0 - "+r"(dst_y), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "q0", "q1" // Clobber List - ); -} - -void UYVYToYRow_NEON(const uint8_t* src_uyvy, uint8_t* dst_y, int width) { - asm volatile( - "1: \n" - "vld2.8 {q0, q1}, [%0]! \n" // load 16 pixels of UYVY. - "subs %2, %2, #16 \n" // 16 processed per loop. - "vst1.8 {q1}, [%1]! \n" // store 16 pixels of Y. - "bgt 1b \n" - : "+r"(src_uyvy), // %0 - "+r"(dst_y), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "q0", "q1" // Clobber List - ); -} - -void YUY2ToUV422Row_NEON(const uint8_t* src_yuy2, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - asm volatile( - "1: \n" - "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of YUY2. - "subs %3, %3, #16 \n" // 16 pixels = 8 UVs. - "vst1.8 {d1}, [%1]! \n" // store 8 U. - "vst1.8 {d3}, [%2]! \n" // store 8 V. - "bgt 1b \n" - : "+r"(src_yuy2), // %0 - "+r"(dst_u), // %1 - "+r"(dst_v), // %2 - "+r"(width) // %3 - : - : "cc", "memory", "d0", "d1", "d2", "d3" // Clobber List - ); -} - -void UYVYToUV422Row_NEON(const uint8_t* src_uyvy, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - asm volatile( - "1: \n" - "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of UYVY. 
- "subs %3, %3, #16 \n" // 16 pixels = 8 UVs. - "vst1.8 {d0}, [%1]! \n" // store 8 U. - "vst1.8 {d2}, [%2]! \n" // store 8 V. - "bgt 1b \n" - : "+r"(src_uyvy), // %0 - "+r"(dst_u), // %1 - "+r"(dst_v), // %2 - "+r"(width) // %3 - : - : "cc", "memory", "d0", "d1", "d2", "d3" // Clobber List - ); -} - -void YUY2ToUVRow_NEON(const uint8_t* src_yuy2, - int stride_yuy2, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - asm volatile( - "add %1, %0, %1 \n" // stride + src_yuy2 - "1: \n" - "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of YUY2. - "subs %4, %4, #16 \n" // 16 pixels = 8 UVs. - "vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load next row YUY2. - "vrhadd.u8 d1, d1, d5 \n" // average rows of U - "vrhadd.u8 d3, d3, d7 \n" // average rows of V - "vst1.8 {d1}, [%2]! \n" // store 8 U. - "vst1.8 {d3}, [%3]! \n" // store 8 V. - "bgt 1b \n" - : "+r"(src_yuy2), // %0 - "+r"(stride_yuy2), // %1 - "+r"(dst_u), // %2 - "+r"(dst_v), // %3 - "+r"(width) // %4 - : - : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", - "d7" // Clobber List - ); -} - -void UYVYToUVRow_NEON(const uint8_t* src_uyvy, - int stride_uyvy, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - asm volatile( - "add %1, %0, %1 \n" // stride + src_uyvy - "1: \n" - "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of UYVY. - "subs %4, %4, #16 \n" // 16 pixels = 8 UVs. - "vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load next row UYVY. - "vrhadd.u8 d0, d0, d4 \n" // average rows of U - "vrhadd.u8 d2, d2, d6 \n" // average rows of V - "vst1.8 {d0}, [%2]! \n" // store 8 U. - "vst1.8 {d2}, [%3]! \n" // store 8 V. - "bgt 1b \n" - : "+r"(src_uyvy), // %0 - "+r"(stride_uyvy), // %1 - "+r"(dst_u), // %2 - "+r"(dst_v), // %3 - "+r"(width) // %4 - : - : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", - "d7" // Clobber List - ); -} - -// For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA. -void ARGBShuffleRow_NEON(const uint8_t* src_argb, - uint8_t* dst_argb, - const uint8_t* shuffler, - int width) { - asm volatile( - "vld1.8 {q2}, [%3] \n" // shuffler - "1: \n" - "vld1.8 {q0}, [%0]! \n" // load 4 pixels. - "subs %2, %2, #4 \n" // 4 processed per loop - "vtbl.8 d2, {d0, d1}, d4 \n" // look up 2 first pixels - "vtbl.8 d3, {d0, d1}, d5 \n" // look up 2 next pixels - "vst1.8 {q1}, [%1]! \n" // store 4. - "bgt 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_argb), // %1 - "+r"(width) // %2 - : "r"(shuffler) // %3 - : "cc", "memory", "q0", "q1", "q2" // Clobber List - ); -} - -void I422ToYUY2Row_NEON(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* dst_yuy2, - int width) { - asm volatile( - "1: \n" - "vld2.8 {d0, d2}, [%0]! \n" // load 16 Ys - "vld1.8 {d1}, [%1]! \n" // load 8 Us - "vld1.8 {d3}, [%2]! \n" // load 8 Vs - "subs %4, %4, #16 \n" // 16 pixels - "vst4.8 {d0, d1, d2, d3}, [%3]! \n" // Store 8 YUY2/16 pixels. - "bgt 1b \n" - : "+r"(src_y), // %0 - "+r"(src_u), // %1 - "+r"(src_v), // %2 - "+r"(dst_yuy2), // %3 - "+r"(width) // %4 - : - : "cc", "memory", "d0", "d1", "d2", "d3"); -} - -void I422ToUYVYRow_NEON(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* dst_uyvy, - int width) { - asm volatile( - "1: \n" - "vld2.8 {d1, d3}, [%0]! \n" // load 16 Ys - "vld1.8 {d0}, [%1]! \n" // load 8 Us - "vld1.8 {d2}, [%2]! \n" // load 8 Vs - "subs %4, %4, #16 \n" // 16 pixels - "vst4.8 {d0, d1, d2, d3}, [%3]! \n" // Store 8 UYVY/16 pixels. 
- "bgt 1b \n" - : "+r"(src_y), // %0 - "+r"(src_u), // %1 - "+r"(src_v), // %2 - "+r"(dst_uyvy), // %3 - "+r"(width) // %4 - : - : "cc", "memory", "d0", "d1", "d2", "d3"); -} - -void ARGBToRGB565Row_NEON(const uint8_t* src_argb, - uint8_t* dst_rgb565, - int width) { - asm volatile( - "1: \n" - "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 pixels of ARGB. - "subs %2, %2, #8 \n" // 8 processed per loop. - ARGBTORGB565 - "vst1.8 {q2}, [%1]! \n" // store 8 pixels RGB565. - "bgt 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_rgb565), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "q0", "q1", "q2", "d6"); -} - -void ARGBToRGB565DitherRow_NEON(const uint8_t* src_argb, - uint8_t* dst_rgb, - const uint32_t dither4, - int width) { - asm volatile( - "vdup.32 d7, %2 \n" // dither4 - "1: \n" - "vld4.8 {d0, d2, d4, d6}, [%1]! \n" // load 8 pixels of ARGB. - "subs %3, %3, #8 \n" // 8 processed per loop. - "vqadd.u8 d0, d0, d7 \n" - "vqadd.u8 d2, d2, d7 \n" - "vqadd.u8 d4, d4, d7 \n" // add for dither - ARGBTORGB565 - "vst1.8 {q2}, [%0]! \n" // store 8 RGB565. - "bgt 1b \n" - : "+r"(dst_rgb) // %0 - : "r"(src_argb), // %1 - "r"(dither4), // %2 - "r"(width) // %3 - : "cc", "memory", "q0", "q1", "q2", "q3"); -} - -void ARGBToARGB1555Row_NEON(const uint8_t* src_argb, - uint8_t* dst_argb1555, - int width) { - asm volatile( - "1: \n" - "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 pixels of ARGB. - "subs %2, %2, #8 \n" // 8 processed per loop. - ARGBTOARGB1555 - "vst1.8 {q3}, [%1]! \n" // store 8 ARGB1555. - "bgt 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_argb1555), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "q0", "q1", "q2", "q3"); -} - -void ARGBToARGB4444Row_NEON(const uint8_t* src_argb, - uint8_t* dst_argb4444, - int width) { - asm volatile( - "vmov.u8 d7, #0x0f \n" // bits to clear with - // vbic. - "1: \n" - "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 pixels of ARGB. - "subs %2, %2, #8 \n" // 8 processed per loop. - ARGBTOARGB4444 - "vst1.8 {q0}, [%1]! \n" // store 8 ARGB4444. - "bgt 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_argb4444), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "q0", "q1", "q2", "q3"); -} - -void ARGBToYRow_NEON(const uint8_t* src_argb, uint8_t* dst_y, int width) { - asm volatile( - "vmov.u8 d24, #25 \n" // B * 0.1016 coefficient - "vmov.u8 d25, #129 \n" // G * 0.5078 coefficient - "vmov.u8 d26, #66 \n" // R * 0.2578 coefficient - "vmov.u8 d27, #16 \n" // Add 16 constant - "1: \n" - "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels. - "subs %2, %2, #8 \n" // 8 processed per loop. - "vmull.u8 q2, d0, d24 \n" // B - "vmlal.u8 q2, d1, d25 \n" // G - "vmlal.u8 q2, d2, d26 \n" // R - "vqrshrn.u16 d0, q2, #8 \n" // 16 bit to 8 bit Y - "vqadd.u8 d0, d27 \n" - "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. - "bgt 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_y), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "q0", "q1", "q2", "q12", "q13"); -} - -void ARGBExtractAlphaRow_NEON(const uint8_t* src_argb, - uint8_t* dst_a, - int width) { - asm volatile( - "1: \n" - "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels - "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB pixels - "subs %2, %2, #16 \n" // 16 processed per loop - "vst1.8 {q3}, [%1]! \n" // store 16 A's. 
- "bgt 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_a), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "q0", "q1", "q2", "q3" // Clobber List - ); -} - -void ARGBToYJRow_NEON(const uint8_t* src_argb, uint8_t* dst_y, int width) { - asm volatile( - "vmov.u8 d24, #29 \n" // B * 0.1140 coefficient - "vmov.u8 d25, #150 \n" // G * 0.5870 coefficient - "vmov.u8 d26, #77 \n" // R * 0.2990 coefficient - "1: \n" - "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels. - "subs %2, %2, #8 \n" // 8 processed per loop. - "vmull.u8 q2, d0, d24 \n" // B - "vmlal.u8 q2, d1, d25 \n" // G - "vmlal.u8 q2, d2, d26 \n" // R - "vqrshrn.u16 d0, q2, #8 \n" // 16 bit to 8 bit Y - "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. - "bgt 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_y), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "q0", "q1", "q2", "q12", "q13"); -} - -void RGBAToYJRow_NEON(const uint8_t* src_rgba, uint8_t* dst_y, int width) { - asm volatile( - "vmov.u8 d24, #29 \n" // B * 0.1140 coefficient - "vmov.u8 d25, #150 \n" // G * 0.5870 coefficient - "vmov.u8 d26, #77 \n" // R * 0.2990 coefficient - "1: \n" - "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 RGBA pixels. - "subs %2, %2, #8 \n" // 8 processed per loop. - "vmull.u8 q2, d1, d24 \n" // B - "vmlal.u8 q2, d2, d25 \n" // G - "vmlal.u8 q2, d3, d26 \n" // R - "vqrshrn.u16 d0, q2, #8 \n" // 16 bit to 8 bit Y - "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. - "bgt 1b \n" - : "+r"(src_rgba), // %0 - "+r"(dst_y), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "q0", "q1", "q2", "q12", "q13"); -} - -// 8x1 pixels. -void ARGBToUV444Row_NEON(const uint8_t* src_argb, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - asm volatile( - "vmov.u8 d24, #112 \n" // UB / VR 0.875 - // coefficient - "vmov.u8 d25, #74 \n" // UG -0.5781 coefficient - "vmov.u8 d26, #38 \n" // UR -0.2969 coefficient - "vmov.u8 d27, #18 \n" // VB -0.1406 coefficient - "vmov.u8 d28, #94 \n" // VG -0.7344 coefficient - "vmov.u16 q15, #0x8080 \n" // 128.5 - "1: \n" - "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels. - "subs %3, %3, #8 \n" // 8 processed per loop. - "vmull.u8 q2, d0, d24 \n" // B - "vmlsl.u8 q2, d1, d25 \n" // G - "vmlsl.u8 q2, d2, d26 \n" // R - "vadd.u16 q2, q2, q15 \n" // +128 -> unsigned - - "vmull.u8 q3, d2, d24 \n" // R - "vmlsl.u8 q3, d1, d28 \n" // G - "vmlsl.u8 q3, d0, d27 \n" // B - "vadd.u16 q3, q3, q15 \n" // +128 -> unsigned - - "vqshrn.u16 d0, q2, #8 \n" // 16 bit to 8 bit U - "vqshrn.u16 d1, q3, #8 \n" // 16 bit to 8 bit V - - "vst1.8 {d0}, [%1]! \n" // store 8 pixels U. - "vst1.8 {d1}, [%2]! \n" // store 8 pixels V. - "bgt 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_u), // %1 - "+r"(dst_v), // %2 - "+r"(width) // %3 - : - : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q12", "q13", "q14", - "q15"); -} - -// clang-format off -// 16x2 pixels -> 8x1. width is number of argb pixels. e.g. 16. -#define RGBTOUV(QB, QG, QR) \ - "vmul.s16 q8, " #QB ", q10 \n" /* B */ \ - "vmls.s16 q8, " #QG ", q11 \n" /* G */ \ - "vmls.s16 q8, " #QR ", q12 \n" /* R */ \ - "vadd.u16 q8, q8, q15 \n" /* +128 -> unsigned */ \ - "vmul.s16 q9, " #QR ", q10 \n" /* R */ \ - "vmls.s16 q9, " #QG ", q14 \n" /* G */ \ - "vmls.s16 q9, " #QB ", q13 \n" /* B */ \ - "vadd.u16 q9, q9, q15 \n" /* +128 -> unsigned */ \ - "vqshrn.u16 d0, q8, #8 \n" /* 16 bit to 8 bit U */ \ - "vqshrn.u16 d1, q9, #8 \n" /* 16 bit to 8 bit V */ -// clang-format on - -// TODO(fbarchard): Consider vhadd vertical, then vpaddl horizontal, avoid shr. 
-void ARGBToUVRow_NEON(const uint8_t* src_argb, - int src_stride_argb, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - asm volatile ( - "add %1, %0, %1 \n" // src_stride + src_argb - "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient - "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient - "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient - "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient - "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient - "vmov.u16 q15, #0x8080 \n" // 128.5 - "1: \n" - "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels. - "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB pixels. - "vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts. - "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts. - "vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts. - "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more ARGB pixels. - "vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 ARGB pixels. - "vpadal.u8 q0, q4 \n" // B 16 bytes -> 8 shorts. - "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts. - "vpadal.u8 q2, q6 \n" // R 16 bytes -> 8 shorts. - - "vrshr.u16 q0, q0, #1 \n" // 2x average - "vrshr.u16 q1, q1, #1 \n" - "vrshr.u16 q2, q2, #1 \n" - - "subs %4, %4, #16 \n" // 16 processed per loop. - RGBTOUV(q0, q1, q2) - "vst1.8 {d0}, [%2]! \n" // store 8 pixels U. - "vst1.8 {d1}, [%3]! \n" // store 8 pixels V. - "bgt 1b \n" - : "+r"(src_argb), // %0 - "+r"(src_stride_argb), // %1 - "+r"(dst_u), // %2 - "+r"(dst_v), // %3 - "+r"(width) // %4 - : - : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", - "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" - ); -} - -// TODO(fbarchard): Subsample match C code. -void ARGBToUVJRow_NEON(const uint8_t* src_argb, - int src_stride_argb, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - asm volatile ( - "add %1, %0, %1 \n" // src_stride + src_argb - "vmov.s16 q10, #127 / 2 \n" // UB / VR 0.500 coefficient - "vmov.s16 q11, #84 / 2 \n" // UG -0.33126 coefficient - "vmov.s16 q12, #43 / 2 \n" // UR -0.16874 coefficient - "vmov.s16 q13, #20 / 2 \n" // VB -0.08131 coefficient - "vmov.s16 q14, #107 / 2 \n" // VG -0.41869 coefficient - "vmov.u16 q15, #0x8080 \n" // 128.5 - "1: \n" - "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels. - "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB pixels. - "vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts. - "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts. - "vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts. - "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more ARGB pixels. - "vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 ARGB pixels. - "vpadal.u8 q0, q4 \n" // B 16 bytes -> 8 shorts. - "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts. - "vpadal.u8 q2, q6 \n" // R 16 bytes -> 8 shorts. - - "vrshr.u16 q0, q0, #1 \n" // 2x average - "vrshr.u16 q1, q1, #1 \n" - "vrshr.u16 q2, q2, #1 \n" - - "subs %4, %4, #16 \n" // 16 processed per loop. - RGBTOUV(q0, q1, q2) - "vst1.8 {d0}, [%2]! \n" // store 8 pixels U. - "vst1.8 {d1}, [%3]! \n" // store 8 pixels V. 
- "bgt 1b \n" - : "+r"(src_argb), // %0 - "+r"(src_stride_argb), // %1 - "+r"(dst_u), // %2 - "+r"(dst_v), // %3 - "+r"(width) // %4 - : - : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", - "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" - ); -} - -void BGRAToUVRow_NEON(const uint8_t* src_bgra, - int src_stride_bgra, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - asm volatile ( - "add %1, %0, %1 \n" // src_stride + src_bgra - "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient - "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient - "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient - "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient - "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient - "vmov.u16 q15, #0x8080 \n" // 128.5 - "1: \n" - "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 BGRA pixels. - "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 BGRA pixels. - "vpaddl.u8 q3, q3 \n" // B 16 bytes -> 8 shorts. - "vpaddl.u8 q2, q2 \n" // G 16 bytes -> 8 shorts. - "vpaddl.u8 q1, q1 \n" // R 16 bytes -> 8 shorts. - "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more BGRA pixels. - "vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 BGRA pixels. - "vpadal.u8 q3, q7 \n" // B 16 bytes -> 8 shorts. - "vpadal.u8 q2, q6 \n" // G 16 bytes -> 8 shorts. - "vpadal.u8 q1, q5 \n" // R 16 bytes -> 8 shorts. - - "vrshr.u16 q1, q1, #1 \n" // 2x average - "vrshr.u16 q2, q2, #1 \n" - "vrshr.u16 q3, q3, #1 \n" - - "subs %4, %4, #16 \n" // 16 processed per loop. - RGBTOUV(q3, q2, q1) - "vst1.8 {d0}, [%2]! \n" // store 8 pixels U. - "vst1.8 {d1}, [%3]! \n" // store 8 pixels V. - "bgt 1b \n" - : "+r"(src_bgra), // %0 - "+r"(src_stride_bgra), // %1 - "+r"(dst_u), // %2 - "+r"(dst_v), // %3 - "+r"(width) // %4 - : - : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", - "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" - ); -} - -void ABGRToUVRow_NEON(const uint8_t* src_abgr, - int src_stride_abgr, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - asm volatile ( - "add %1, %0, %1 \n" // src_stride + src_abgr - "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient - "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient - "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient - "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient - "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient - "vmov.u16 q15, #0x8080 \n" // 128.5 - "1: \n" - "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ABGR pixels. - "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ABGR pixels. - "vpaddl.u8 q2, q2 \n" // B 16 bytes -> 8 shorts. - "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts. - "vpaddl.u8 q0, q0 \n" // R 16 bytes -> 8 shorts. - "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more ABGR pixels. - "vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 ABGR pixels. - "vpadal.u8 q2, q6 \n" // B 16 bytes -> 8 shorts. - "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts. - "vpadal.u8 q0, q4 \n" // R 16 bytes -> 8 shorts. - - "vrshr.u16 q0, q0, #1 \n" // 2x average - "vrshr.u16 q1, q1, #1 \n" - "vrshr.u16 q2, q2, #1 \n" - - "subs %4, %4, #16 \n" // 16 processed per loop. - RGBTOUV(q2, q1, q0) - "vst1.8 {d0}, [%2]! \n" // store 8 pixels U. - "vst1.8 {d1}, [%3]! \n" // store 8 pixels V. 
- "bgt 1b \n" - : "+r"(src_abgr), // %0 - "+r"(src_stride_abgr), // %1 - "+r"(dst_u), // %2 - "+r"(dst_v), // %3 - "+r"(width) // %4 - : - : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", - "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" - ); -} - -void RGBAToUVRow_NEON(const uint8_t* src_rgba, - int src_stride_rgba, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - asm volatile ( - "add %1, %0, %1 \n" // src_stride + src_rgba - "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient - "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient - "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient - "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient - "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient - "vmov.u16 q15, #0x8080 \n" // 128.5 - "1: \n" - "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 RGBA pixels. - "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 RGBA pixels. - "vpaddl.u8 q0, q1 \n" // B 16 bytes -> 8 shorts. - "vpaddl.u8 q1, q2 \n" // G 16 bytes -> 8 shorts. - "vpaddl.u8 q2, q3 \n" // R 16 bytes -> 8 shorts. - "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more RGBA pixels. - "vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 RGBA pixels. - "vpadal.u8 q0, q5 \n" // B 16 bytes -> 8 shorts. - "vpadal.u8 q1, q6 \n" // G 16 bytes -> 8 shorts. - "vpadal.u8 q2, q7 \n" // R 16 bytes -> 8 shorts. - - "vrshr.u16 q0, q0, #1 \n" // 2x average - "vrshr.u16 q1, q1, #1 \n" - "vrshr.u16 q2, q2, #1 \n" - - "subs %4, %4, #16 \n" // 16 processed per loop. - RGBTOUV(q0, q1, q2) - "vst1.8 {d0}, [%2]! \n" // store 8 pixels U. - "vst1.8 {d1}, [%3]! \n" // store 8 pixels V. - "bgt 1b \n" - : "+r"(src_rgba), // %0 - "+r"(src_stride_rgba), // %1 - "+r"(dst_u), // %2 - "+r"(dst_v), // %3 - "+r"(width) // %4 - : - : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", - "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" - ); -} - -void RGB24ToUVRow_NEON(const uint8_t* src_rgb24, - int src_stride_rgb24, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - asm volatile ( - "add %1, %0, %1 \n" // src_stride + src_rgb24 - "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient - "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient - "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient - "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient - "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient - "vmov.u16 q15, #0x8080 \n" // 128.5 - "1: \n" - "vld3.8 {d0, d2, d4}, [%0]! \n" // load 8 RGB24 pixels. - "vld3.8 {d1, d3, d5}, [%0]! \n" // load next 8 RGB24 pixels. - "vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts. - "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts. - "vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts. - "vld3.8 {d8, d10, d12}, [%1]! \n" // load 8 more RGB24 pixels. - "vld3.8 {d9, d11, d13}, [%1]! \n" // load last 8 RGB24 pixels. - "vpadal.u8 q0, q4 \n" // B 16 bytes -> 8 shorts. - "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts. - "vpadal.u8 q2, q6 \n" // R 16 bytes -> 8 shorts. - - "vrshr.u16 q0, q0, #1 \n" // 2x average - "vrshr.u16 q1, q1, #1 \n" - "vrshr.u16 q2, q2, #1 \n" - - "subs %4, %4, #16 \n" // 16 processed per loop. - RGBTOUV(q0, q1, q2) - "vst1.8 {d0}, [%2]! \n" // store 8 pixels U. - "vst1.8 {d1}, [%3]! \n" // store 8 pixels V. 
- "bgt 1b \n" - : "+r"(src_rgb24), // %0 - "+r"(src_stride_rgb24), // %1 - "+r"(dst_u), // %2 - "+r"(dst_v), // %3 - "+r"(width) // %4 - : - : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", - "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" - ); -} - -void RAWToUVRow_NEON(const uint8_t* src_raw, - int src_stride_raw, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - asm volatile ( - "add %1, %0, %1 \n" // src_stride + src_raw - "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient - "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient - "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient - "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient - "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient - "vmov.u16 q15, #0x8080 \n" // 128.5 - "1: \n" - "vld3.8 {d0, d2, d4}, [%0]! \n" // load 8 RAW pixels. - "vld3.8 {d1, d3, d5}, [%0]! \n" // load next 8 RAW pixels. - "vpaddl.u8 q2, q2 \n" // B 16 bytes -> 8 shorts. - "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts. - "vpaddl.u8 q0, q0 \n" // R 16 bytes -> 8 shorts. - "vld3.8 {d8, d10, d12}, [%1]! \n" // load 8 more RAW pixels. - "vld3.8 {d9, d11, d13}, [%1]! \n" // load last 8 RAW pixels. - "vpadal.u8 q2, q6 \n" // B 16 bytes -> 8 shorts. - "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts. - "vpadal.u8 q0, q4 \n" // R 16 bytes -> 8 shorts. - - "vrshr.u16 q0, q0, #1 \n" // 2x average - "vrshr.u16 q1, q1, #1 \n" - "vrshr.u16 q2, q2, #1 \n" - - "subs %4, %4, #16 \n" // 16 processed per loop. - RGBTOUV(q2, q1, q0) - "vst1.8 {d0}, [%2]! \n" // store 8 pixels U. - "vst1.8 {d1}, [%3]! \n" // store 8 pixels V. - "bgt 1b \n" - : "+r"(src_raw), // %0 - "+r"(src_stride_raw), // %1 - "+r"(dst_u), // %2 - "+r"(dst_v), // %3 - "+r"(width) // %4 - : - : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", - "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" - ); -} - -// 16x2 pixels -> 8x1. width is number of argb pixels. e.g. 16. -void RGB565ToUVRow_NEON(const uint8_t* src_rgb565, - int src_stride_rgb565, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - asm volatile( - "add %1, %0, %1 \n" // src_stride + src_argb - "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 - // coefficient - "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient - "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient - "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient - "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient - "vmov.u16 q15, #0x8080 \n" // 128.5 - "1: \n" - "vld1.8 {q0}, [%0]! \n" // load 8 RGB565 pixels. - RGB565TOARGB - "vpaddl.u8 d8, d0 \n" // B 8 bytes -> 4 shorts. - "vpaddl.u8 d10, d1 \n" // G 8 bytes -> 4 shorts. - "vpaddl.u8 d12, d2 \n" // R 8 bytes -> 4 shorts. - "vld1.8 {q0}, [%0]! \n" // next 8 RGB565 pixels. - RGB565TOARGB - "vpaddl.u8 d9, d0 \n" // B 8 bytes -> 4 shorts. - "vpaddl.u8 d11, d1 \n" // G 8 bytes -> 4 shorts. - "vpaddl.u8 d13, d2 \n" // R 8 bytes -> 4 shorts. - - "vld1.8 {q0}, [%1]! \n" // load 8 RGB565 pixels. - RGB565TOARGB - "vpadal.u8 d8, d0 \n" // B 8 bytes -> 4 shorts. - "vpadal.u8 d10, d1 \n" // G 8 bytes -> 4 shorts. - "vpadal.u8 d12, d2 \n" // R 8 bytes -> 4 shorts. - "vld1.8 {q0}, [%1]! \n" // next 8 RGB565 pixels. - RGB565TOARGB - "vpadal.u8 d9, d0 \n" // B 8 bytes -> 4 shorts. - "vpadal.u8 d11, d1 \n" // G 8 bytes -> 4 shorts. - "vpadal.u8 d13, d2 \n" // R 8 bytes -> 4 shorts. - - "vrshr.u16 q4, q4, #1 \n" // 2x average - "vrshr.u16 q5, q5, #1 \n" - "vrshr.u16 q6, q6, #1 \n" - - "subs %4, %4, #16 \n" // 16 processed per loop. 
- "vmul.s16 q8, q4, q10 \n" // B - "vmls.s16 q8, q5, q11 \n" // G - "vmls.s16 q8, q6, q12 \n" // R - "vadd.u16 q8, q8, q15 \n" // +128 -> unsigned - "vmul.s16 q9, q6, q10 \n" // R - "vmls.s16 q9, q5, q14 \n" // G - "vmls.s16 q9, q4, q13 \n" // B - "vadd.u16 q9, q9, q15 \n" // +128 -> unsigned - "vqshrn.u16 d0, q8, #8 \n" // 16 bit to 8 bit U - "vqshrn.u16 d1, q9, #8 \n" // 16 bit to 8 bit V - "vst1.8 {d0}, [%2]! \n" // store 8 pixels U. - "vst1.8 {d1}, [%3]! \n" // store 8 pixels V. - "bgt 1b \n" - : "+r"(src_rgb565), // %0 - "+r"(src_stride_rgb565), // %1 - "+r"(dst_u), // %2 - "+r"(dst_v), // %3 - "+r"(width) // %4 - : - : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", - "q9", "q10", "q11", "q12", "q13", "q14", "q15"); -} - -// 16x2 pixels -> 8x1. width is number of argb pixels. e.g. 16. -void ARGB1555ToUVRow_NEON(const uint8_t* src_argb1555, - int src_stride_argb1555, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - asm volatile( - "add %1, %0, %1 \n" // src_stride + src_argb - "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 - // coefficient - "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient - "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient - "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient - "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient - "vmov.u16 q15, #0x8080 \n" // 128.5 - "1: \n" - "vld1.8 {q0}, [%0]! \n" // load 8 ARGB1555 pixels. - RGB555TOARGB - "vpaddl.u8 d8, d0 \n" // B 8 bytes -> 4 shorts. - "vpaddl.u8 d10, d1 \n" // G 8 bytes -> 4 shorts. - "vpaddl.u8 d12, d2 \n" // R 8 bytes -> 4 shorts. - "vld1.8 {q0}, [%0]! \n" // next 8 ARGB1555 pixels. - RGB555TOARGB - "vpaddl.u8 d9, d0 \n" // B 8 bytes -> 4 shorts. - "vpaddl.u8 d11, d1 \n" // G 8 bytes -> 4 shorts. - "vpaddl.u8 d13, d2 \n" // R 8 bytes -> 4 shorts. - - "vld1.8 {q0}, [%1]! \n" // load 8 ARGB1555 pixels. - RGB555TOARGB - "vpadal.u8 d8, d0 \n" // B 8 bytes -> 4 shorts. - "vpadal.u8 d10, d1 \n" // G 8 bytes -> 4 shorts. - "vpadal.u8 d12, d2 \n" // R 8 bytes -> 4 shorts. - "vld1.8 {q0}, [%1]! \n" // next 8 ARGB1555 pixels. - RGB555TOARGB - "vpadal.u8 d9, d0 \n" // B 8 bytes -> 4 shorts. - "vpadal.u8 d11, d1 \n" // G 8 bytes -> 4 shorts. - "vpadal.u8 d13, d2 \n" // R 8 bytes -> 4 shorts. - - "vrshr.u16 q4, q4, #1 \n" // 2x average - "vrshr.u16 q5, q5, #1 \n" - "vrshr.u16 q6, q6, #1 \n" - - "subs %4, %4, #16 \n" // 16 processed per loop. - "vmul.s16 q8, q4, q10 \n" // B - "vmls.s16 q8, q5, q11 \n" // G - "vmls.s16 q8, q6, q12 \n" // R - "vadd.u16 q8, q8, q15 \n" // +128 -> unsigned - "vmul.s16 q9, q6, q10 \n" // R - "vmls.s16 q9, q5, q14 \n" // G - "vmls.s16 q9, q4, q13 \n" // B - "vadd.u16 q9, q9, q15 \n" // +128 -> unsigned - "vqshrn.u16 d0, q8, #8 \n" // 16 bit to 8 bit U - "vqshrn.u16 d1, q9, #8 \n" // 16 bit to 8 bit V - "vst1.8 {d0}, [%2]! \n" // store 8 pixels U. - "vst1.8 {d1}, [%3]! \n" // store 8 pixels V. - "bgt 1b \n" - : "+r"(src_argb1555), // %0 - "+r"(src_stride_argb1555), // %1 - "+r"(dst_u), // %2 - "+r"(dst_v), // %3 - "+r"(width) // %4 - : - : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", - "q9", "q10", "q11", "q12", "q13", "q14", "q15"); -} - -// 16x2 pixels -> 8x1. width is number of argb pixels. e.g. 16. 
-void ARGB4444ToUVRow_NEON(const uint8_t* src_argb4444, - int src_stride_argb4444, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - asm volatile( - "add %1, %0, %1 \n" // src_stride + src_argb - "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 - // coefficient - "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient - "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient - "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient - "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient - "vmov.u16 q15, #0x8080 \n" // 128.5 - "1: \n" - "vld1.8 {q0}, [%0]! \n" // load 8 ARGB4444 pixels. - ARGB4444TOARGB - "vpaddl.u8 d8, d0 \n" // B 8 bytes -> 4 shorts. - "vpaddl.u8 d10, d1 \n" // G 8 bytes -> 4 shorts. - "vpaddl.u8 d12, d2 \n" // R 8 bytes -> 4 shorts. - "vld1.8 {q0}, [%0]! \n" // next 8 ARGB4444 pixels. - ARGB4444TOARGB - "vpaddl.u8 d9, d0 \n" // B 8 bytes -> 4 shorts. - "vpaddl.u8 d11, d1 \n" // G 8 bytes -> 4 shorts. - "vpaddl.u8 d13, d2 \n" // R 8 bytes -> 4 shorts. - - "vld1.8 {q0}, [%1]! \n" // load 8 ARGB4444 pixels. - ARGB4444TOARGB - "vpadal.u8 d8, d0 \n" // B 8 bytes -> 4 shorts. - "vpadal.u8 d10, d1 \n" // G 8 bytes -> 4 shorts. - "vpadal.u8 d12, d2 \n" // R 8 bytes -> 4 shorts. - "vld1.8 {q0}, [%1]! \n" // next 8 ARGB4444 pixels. - ARGB4444TOARGB - "vpadal.u8 d9, d0 \n" // B 8 bytes -> 4 shorts. - "vpadal.u8 d11, d1 \n" // G 8 bytes -> 4 shorts. - "vpadal.u8 d13, d2 \n" // R 8 bytes -> 4 shorts. - - "vrshr.u16 q0, q4, #1 \n" // 2x average - "vrshr.u16 q1, q5, #1 \n" - "vrshr.u16 q2, q6, #1 \n" - - "subs %4, %4, #16 \n" // 16 processed per loop. - RGBTOUV(q0, q1, q2) - "vst1.8 {d0}, [%2]! \n" // store 8 pixels U. - "vst1.8 {d1}, [%3]! \n" // store 8 pixels V. - "bgt 1b \n" - : "+r"(src_argb4444), // %0 - "+r"(src_stride_argb4444), // %1 - "+r"(dst_u), // %2 - "+r"(dst_v), // %3 - "+r"(width) // %4 - : - : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", - "q9", "q10", "q11", "q12", "q13", "q14", "q15"); -} - -void RGB565ToYRow_NEON(const uint8_t* src_rgb565, uint8_t* dst_y, int width) { - asm volatile( - "vmov.u8 d24, #25 \n" // B * 0.1016 coefficient - "vmov.u8 d25, #129 \n" // G * 0.5078 coefficient - "vmov.u8 d26, #66 \n" // R * 0.2578 coefficient - "vmov.u8 d27, #16 \n" // Add 16 constant - "1: \n" - "vld1.8 {q0}, [%0]! \n" // load 8 RGB565 pixels. - "subs %2, %2, #8 \n" // 8 processed per loop. - RGB565TOARGB - "vmull.u8 q2, d0, d24 \n" // B - "vmlal.u8 q2, d1, d25 \n" // G - "vmlal.u8 q2, d2, d26 \n" // R - "vqrshrn.u16 d0, q2, #8 \n" // 16 bit to 8 bit Y - "vqadd.u8 d0, d27 \n" - "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. - "bgt 1b \n" - : "+r"(src_rgb565), // %0 - "+r"(dst_y), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "q0", "q1", "q2", "q3", "q12", "q13"); -} - -void ARGB1555ToYRow_NEON(const uint8_t* src_argb1555, - uint8_t* dst_y, - int width) { - asm volatile( - "vmov.u8 d24, #25 \n" // B * 0.1016 coefficient - "vmov.u8 d25, #129 \n" // G * 0.5078 coefficient - "vmov.u8 d26, #66 \n" // R * 0.2578 coefficient - "vmov.u8 d27, #16 \n" // Add 16 constant - "1: \n" - "vld1.8 {q0}, [%0]! \n" // load 8 ARGB1555 pixels. - "subs %2, %2, #8 \n" // 8 processed per loop. - ARGB1555TOARGB - "vmull.u8 q2, d0, d24 \n" // B - "vmlal.u8 q2, d1, d25 \n" // G - "vmlal.u8 q2, d2, d26 \n" // R - "vqrshrn.u16 d0, q2, #8 \n" // 16 bit to 8 bit Y - "vqadd.u8 d0, d27 \n" - "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. 
- "bgt 1b \n" - : "+r"(src_argb1555), // %0 - "+r"(dst_y), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "q0", "q1", "q2", "q3", "q12", "q13"); -} - -void ARGB4444ToYRow_NEON(const uint8_t* src_argb4444, - uint8_t* dst_y, - int width) { - asm volatile( - "vmov.u8 d24, #25 \n" // B * 0.1016 coefficient - "vmov.u8 d25, #129 \n" // G * 0.5078 coefficient - "vmov.u8 d26, #66 \n" // R * 0.2578 coefficient - "vmov.u8 d27, #16 \n" // Add 16 constant - "1: \n" - "vld1.8 {q0}, [%0]! \n" // load 8 ARGB4444 pixels. - "subs %2, %2, #8 \n" // 8 processed per loop. - ARGB4444TOARGB - "vmull.u8 q2, d0, d24 \n" // B - "vmlal.u8 q2, d1, d25 \n" // G - "vmlal.u8 q2, d2, d26 \n" // R - "vqrshrn.u16 d0, q2, #8 \n" // 16 bit to 8 bit Y - "vqadd.u8 d0, d27 \n" - "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. - "bgt 1b \n" - : "+r"(src_argb4444), // %0 - "+r"(dst_y), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "q0", "q1", "q2", "q3", "q12", "q13"); -} - -static const uvec8 kShuffleARGBToABGR = {2, 1, 0, 3, 6, 5, 4, 7, - 10, 9, 8, 11, 14, 13, 12, 15}; - -void ARGBToAR64Row_NEON(const uint8_t* src_argb, - uint16_t* dst_ar64, - int width) { - asm volatile( - "1: \n" - "vld1.8 {q0}, [%0]! \n" - "vld1.8 {q2}, [%0]! \n" - "vmov.u8 q1, q0 \n" - "vmov.u8 q3, q2 \n" - "subs %2, %2, #8 \n" // 8 processed per loop. - "vst2.8 {q0, q1}, [%1]! \n" // store 4 pixels - "vst2.8 {q2, q3}, [%1]! \n" // store 4 pixels - "bgt 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_ar64), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "q0", "q1", "q2", "q3"); -} - -void ARGBToAB64Row_NEON(const uint8_t* src_argb, - uint16_t* dst_ab64, - int width) { - asm volatile( - "vld1.8 q4, %3 \n" // shuffler - "1: \n" - "vld1.8 {q0}, [%0]! \n" - "vld1.8 {q2}, [%0]! \n" - "vtbl.8 d2, {d0, d1}, d8 \n" - "vtbl.8 d3, {d0, d1}, d9 \n" - "vtbl.8 d6, {d4, d5}, d8 \n" - "vtbl.8 d7, {d4, d5}, d9 \n" - "vmov.u8 q0, q1 \n" - "vmov.u8 q2, q3 \n" - "subs %2, %2, #8 \n" // 8 processed per loop. - "vst2.8 {q0, q1}, [%1]! \n" // store 4 pixels - "vst2.8 {q2, q3}, [%1]! \n" // store 4 pixels - "bgt 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_ab64), // %1 - "+r"(width) // %2 - : "m"(kShuffleARGBToABGR) // %3 - : "cc", "memory", "q0", "q1", "q2", "q3", "q4"); -} - -void AR64ToARGBRow_NEON(const uint16_t* src_ar64, - uint8_t* dst_argb, - int width) { - asm volatile( - "1: \n" - "vld1.16 {q0}, [%0]! \n" - "vld1.16 {q1}, [%0]! \n" - "vld1.16 {q2}, [%0]! \n" - "vld1.16 {q3}, [%0]! \n" - "vshrn.u16 d0, q0, #8 \n" - "vshrn.u16 d1, q1, #8 \n" - "vshrn.u16 d4, q2, #8 \n" - "vshrn.u16 d5, q3, #8 \n" - "subs %2, %2, #8 \n" // 8 processed per loop. - "vst1.8 {q0}, [%1]! \n" // store 4 pixels - "vst1.8 {q2}, [%1]! \n" // store 4 pixels - "bgt 1b \n" - : "+r"(src_ar64), // %0 - "+r"(dst_argb), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "q0", "q1", "q2", "q3"); -} - -static const uvec8 kShuffleAB64ToARGB = {5, 3, 1, 7, 13, 11, 9, 15}; - -void AB64ToARGBRow_NEON(const uint16_t* src_ab64, - uint8_t* dst_argb, - int width) { - asm volatile( - "vld1.8 d8, %3 \n" // shuffler - "1: \n" - "vld1.16 {q0}, [%0]! \n" - "vld1.16 {q1}, [%0]! \n" - "vld1.16 {q2}, [%0]! \n" - "vld1.16 {q3}, [%0]! \n" - "vtbl.8 d0, {d0, d1}, d8 \n" - "vtbl.8 d1, {d2, d3}, d8 \n" - "vtbl.8 d4, {d4, d5}, d8 \n" - "vtbl.8 d5, {d6, d7}, d8 \n" - "subs %2, %2, #8 \n" // 8 processed per loop. - "vst1.8 {q0}, [%1]! \n" // store 4 pixels - "vst1.8 {q2}, [%1]! 
\n" // store 4 pixels - "bgt 1b \n" - : "+r"(src_ab64), // %0 - "+r"(dst_argb), // %1 - "+r"(width) // %2 - : "m"(kShuffleAB64ToARGB) // %3 - : "cc", "memory", "q0", "q1", "q2", "q3", "q4"); -} - -void BGRAToYRow_NEON(const uint8_t* src_bgra, uint8_t* dst_y, int width) { - asm volatile( - "vmov.u8 d6, #25 \n" // B * 0.1016 coefficient - "vmov.u8 d5, #129 \n" // G * 0.5078 coefficient - "vmov.u8 d4, #66 \n" // R * 0.2578 coefficient - "vmov.u8 d7, #16 \n" // Add 16 constant - "1: \n" - "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of BGRA. - "subs %2, %2, #8 \n" // 8 processed per loop. - "vmull.u8 q8, d1, d4 \n" // R - "vmlal.u8 q8, d2, d5 \n" // G - "vmlal.u8 q8, d3, d6 \n" // B - "vqrshrn.u16 d0, q8, #8 \n" // 16 bit to 8 bit Y - "vqadd.u8 d0, d7 \n" - "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. - "bgt 1b \n" - : "+r"(src_bgra), // %0 - "+r"(dst_y), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8"); -} - -void ABGRToYRow_NEON(const uint8_t* src_abgr, uint8_t* dst_y, int width) { - asm volatile( - "vmov.u8 d6, #25 \n" // B * 0.1016 coefficient - "vmov.u8 d5, #129 \n" // G * 0.5078 coefficient - "vmov.u8 d4, #66 \n" // R * 0.2578 coefficient - "vmov.u8 d7, #16 \n" // Add 16 constant - "1: \n" - "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of ABGR. - "subs %2, %2, #8 \n" // 8 processed per loop. - "vmull.u8 q8, d0, d4 \n" // R - "vmlal.u8 q8, d1, d5 \n" // G - "vmlal.u8 q8, d2, d6 \n" // B - "vqrshrn.u16 d0, q8, #8 \n" // 16 bit to 8 bit Y - "vqadd.u8 d0, d7 \n" - "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. - "bgt 1b \n" - : "+r"(src_abgr), // %0 - "+r"(dst_y), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8"); -} - -void RGBAToYRow_NEON(const uint8_t* src_rgba, uint8_t* dst_y, int width) { - asm volatile( - "vmov.u8 d4, #25 \n" // B * 0.1016 coefficient - "vmov.u8 d5, #129 \n" // G * 0.5078 coefficient - "vmov.u8 d6, #66 \n" // R * 0.2578 coefficient - "vmov.u8 d7, #16 \n" // Add 16 constant - "1: \n" - "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of RGBA. - "subs %2, %2, #8 \n" // 8 processed per loop. - "vmull.u8 q8, d1, d4 \n" // B - "vmlal.u8 q8, d2, d5 \n" // G - "vmlal.u8 q8, d3, d6 \n" // R - "vqrshrn.u16 d0, q8, #8 \n" // 16 bit to 8 bit Y - "vqadd.u8 d0, d7 \n" - "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. - "bgt 1b \n" - : "+r"(src_rgba), // %0 - "+r"(dst_y), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8"); -} - -void RGB24ToYRow_NEON(const uint8_t* src_rgb24, uint8_t* dst_y, int width) { - asm volatile( - "vmov.u8 d4, #25 \n" // B * 0.1016 coefficient - "vmov.u8 d5, #129 \n" // G * 0.5078 coefficient - "vmov.u8 d6, #66 \n" // R * 0.2578 coefficient - "vmov.u8 d7, #16 \n" // Add 16 constant - "1: \n" - "vld3.8 {d0, d1, d2}, [%0]! \n" // load 8 pixels of RGB24. - "subs %2, %2, #8 \n" // 8 processed per loop. - "vmull.u8 q8, d0, d4 \n" // B - "vmlal.u8 q8, d1, d5 \n" // G - "vmlal.u8 q8, d2, d6 \n" // R - "vqrshrn.u16 d0, q8, #8 \n" // 16 bit to 8 bit Y - "vqadd.u8 d0, d7 \n" - "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. 
- "bgt 1b \n" - : "+r"(src_rgb24), // %0 - "+r"(dst_y), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8"); -} - -void RAWToYRow_NEON(const uint8_t* src_raw, uint8_t* dst_y, int width) { - asm volatile( - "vmov.u8 d6, #25 \n" // B * 0.1016 coefficient - "vmov.u8 d5, #129 \n" // G * 0.5078 coefficient - "vmov.u8 d4, #66 \n" // R * 0.2578 coefficient - "vmov.u8 d7, #16 \n" // Add 16 constant - "1: \n" - "vld3.8 {d0, d1, d2}, [%0]! \n" // load 8 pixels of RAW. - "subs %2, %2, #8 \n" // 8 processed per loop. - "vmull.u8 q8, d0, d4 \n" // B - "vmlal.u8 q8, d1, d5 \n" // G - "vmlal.u8 q8, d2, d6 \n" // R - "vqrshrn.u16 d0, q8, #8 \n" // 16 bit to 8 bit Y - "vqadd.u8 d0, d7 \n" - "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. - "bgt 1b \n" - : "+r"(src_raw), // %0 - "+r"(dst_y), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8"); -} - -void RGB24ToYJRow_NEON(const uint8_t* src_rgb24, uint8_t* dst_yj, int width) { - asm volatile( - "vmov.u8 d4, #29 \n" // B * 0.1140 coefficient - "vmov.u8 d5, #150 \n" // G * 0.5870 coefficient - "vmov.u8 d6, #77 \n" // R * 0.2990 coefficient - "1: \n" - "vld3.8 {d0, d1, d2}, [%0]! \n" // load 8 pixels of RGB24. - "subs %2, %2, #8 \n" // 8 processed per loop. - "vmull.u8 q4, d0, d4 \n" // B - "vmlal.u8 q4, d1, d5 \n" // G - "vmlal.u8 q4, d2, d6 \n" // R - "vqrshrn.u16 d0, q4, #8 \n" // 16 bit to 8 bit Y - "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. - "bgt 1b \n" - : "+r"(src_rgb24), // %0 - "+r"(dst_yj), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "q4"); -} - -void RAWToYJRow_NEON(const uint8_t* src_raw, uint8_t* dst_yj, int width) { - asm volatile( - "vmov.u8 d6, #29 \n" // B * 0.1140 coefficient - "vmov.u8 d5, #150 \n" // G * 0.5870 coefficient - "vmov.u8 d4, #77 \n" // R * 0.2990 coefficient - "1: \n" - "vld3.8 {d0, d1, d2}, [%0]! \n" // load 8 pixels of RAW. - "subs %2, %2, #8 \n" // 8 processed per loop. - "vmull.u8 q4, d0, d4 \n" // R - "vmlal.u8 q4, d1, d5 \n" // G - "vmlal.u8 q4, d2, d6 \n" // B - "vqrshrn.u16 d0, q4, #8 \n" // 16 bit to 8 bit Y - "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. - "bgt 1b \n" - : "+r"(src_raw), // %0 - "+r"(dst_yj), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "q4"); -} - -// Bilinear filter 16x2 -> 16x1 -void InterpolateRow_NEON(uint8_t* dst_ptr, - const uint8_t* src_ptr, - ptrdiff_t src_stride, - int dst_width, - int source_y_fraction) { - int y1_fraction = source_y_fraction; - asm volatile( - "cmp %4, #0 \n" - "beq 100f \n" - "add %2, %1 \n" - "cmp %4, #128 \n" - "beq 50f \n" - - "vdup.8 d5, %4 \n" - "rsb %4, #256 \n" - "vdup.8 d4, %4 \n" - // General purpose row blend. - "1: \n" - "vld1.8 {q0}, [%1]! \n" - "vld1.8 {q1}, [%2]! \n" - "subs %3, %3, #16 \n" - "vmull.u8 q13, d0, d4 \n" - "vmull.u8 q14, d1, d4 \n" - "vmlal.u8 q13, d2, d5 \n" - "vmlal.u8 q14, d3, d5 \n" - "vrshrn.u16 d0, q13, #8 \n" - "vrshrn.u16 d1, q14, #8 \n" - "vst1.8 {q0}, [%0]! \n" - "bgt 1b \n" - "b 99f \n" - - // Blend 50 / 50. - "50: \n" - "vld1.8 {q0}, [%1]! \n" - "vld1.8 {q1}, [%2]! \n" - "subs %3, %3, #16 \n" - "vrhadd.u8 q0, q1 \n" - "vst1.8 {q0}, [%0]! \n" - "bgt 50b \n" - "b 99f \n" - - // Blend 100 / 0 - Copy row unchanged. - "100: \n" - "vld1.8 {q0}, [%1]! \n" - "subs %3, %3, #16 \n" - "vst1.8 {q0}, [%0]! 
\n" - "bgt 100b \n" - - "99: \n" - : "+r"(dst_ptr), // %0 - "+r"(src_ptr), // %1 - "+r"(src_stride), // %2 - "+r"(dst_width), // %3 - "+r"(y1_fraction) // %4 - : - : "cc", "memory", "q0", "q1", "d4", "d5", "q13", "q14"); -} - -// dr * (256 - sa) / 256 + sr = dr - dr * sa / 256 + sr -void ARGBBlendRow_NEON(const uint8_t* src_argb, - const uint8_t* src_argb1, - uint8_t* dst_argb, - int width) { - asm volatile( - "subs %3, #8 \n" - "blt 89f \n" - // Blend 8 pixels. - "8: \n" - "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of ARGB0. - "vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load 8 pixels of ARGB1. - "subs %3, %3, #8 \n" // 8 processed per loop. - "vmull.u8 q10, d4, d3 \n" // db * a - "vmull.u8 q11, d5, d3 \n" // dg * a - "vmull.u8 q12, d6, d3 \n" // dr * a - "vqrshrn.u16 d20, q10, #8 \n" // db >>= 8 - "vqrshrn.u16 d21, q11, #8 \n" // dg >>= 8 - "vqrshrn.u16 d22, q12, #8 \n" // dr >>= 8 - "vqsub.u8 q2, q2, q10 \n" // dbg - dbg * a / 256 - "vqsub.u8 d6, d6, d22 \n" // dr - dr * a / 256 - "vqadd.u8 q0, q0, q2 \n" // + sbg - "vqadd.u8 d2, d2, d6 \n" // + sr - "vmov.u8 d3, #255 \n" // a = 255 - "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 pixels of ARGB. - "bge 8b \n" - - "89: \n" - "adds %3, #8-1 \n" - "blt 99f \n" - - // Blend 1 pixels. - "1: \n" - "vld4.8 {d0[0],d1[0],d2[0],d3[0]}, [%0]! \n" // load 1 pixel ARGB0. - "vld4.8 {d4[0],d5[0],d6[0],d7[0]}, [%1]! \n" // load 1 pixel ARGB1. - "subs %3, %3, #1 \n" // 1 processed per loop. - "vmull.u8 q10, d4, d3 \n" // db * a - "vmull.u8 q11, d5, d3 \n" // dg * a - "vmull.u8 q12, d6, d3 \n" // dr * a - "vqrshrn.u16 d20, q10, #8 \n" // db >>= 8 - "vqrshrn.u16 d21, q11, #8 \n" // dg >>= 8 - "vqrshrn.u16 d22, q12, #8 \n" // dr >>= 8 - "vqsub.u8 q2, q2, q10 \n" // dbg - dbg * a / 256 - "vqsub.u8 d6, d6, d22 \n" // dr - dr * a / 256 - "vqadd.u8 q0, q0, q2 \n" // + sbg - "vqadd.u8 d2, d2, d6 \n" // + sr - "vmov.u8 d3, #255 \n" // a = 255 - "vst4.8 {d0[0],d1[0],d2[0],d3[0]}, [%2]! \n" // store 1 pixel. - "bge 1b \n" - - "99: \n" - - : "+r"(src_argb), // %0 - "+r"(src_argb1), // %1 - "+r"(dst_argb), // %2 - "+r"(width) // %3 - : - : "cc", "memory", "q0", "q1", "q2", "q3", "q10", "q11", "q12"); -} - -// Attenuate 8 pixels at a time. -void ARGBAttenuateRow_NEON(const uint8_t* src_argb, - uint8_t* dst_argb, - int width) { - asm volatile( - // Attenuate 8 pixels. - "1: \n" - "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of ARGB. - "subs %2, %2, #8 \n" // 8 processed per loop. - "vmull.u8 q10, d0, d3 \n" // b * a - "vmull.u8 q11, d1, d3 \n" // g * a - "vmull.u8 q12, d2, d3 \n" // r * a - "vqrshrn.u16 d0, q10, #8 \n" // b >>= 8 - "vqrshrn.u16 d1, q11, #8 \n" // g >>= 8 - "vqrshrn.u16 d2, q12, #8 \n" // r >>= 8 - "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 pixels of ARGB. - "bgt 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_argb), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "q0", "q1", "q10", "q11", "q12"); -} - -// Quantize 8 ARGB pixels (32 bytes). -// dst = (dst * scale >> 16) * interval_size + interval_offset; -void ARGBQuantizeRow_NEON(uint8_t* dst_argb, - int scale, - int interval_size, - int interval_offset, - int width) { - asm volatile( - "vdup.u16 q8, %2 \n" - "vshr.u16 q8, q8, #1 \n" // scale >>= 1 - "vdup.u16 q9, %3 \n" // interval multiply. - "vdup.u16 q10, %4 \n" // interval add - - // 8 pixel loop. - "1: \n" - "vld4.8 {d0, d2, d4, d6}, [%0] \n" // load 8 pixels of ARGB. - "subs %1, %1, #8 \n" // 8 processed per loop. - "vmovl.u8 q0, d0 \n" // b (0 .. 
255) - "vmovl.u8 q1, d2 \n" - "vmovl.u8 q2, d4 \n" - "vqdmulh.s16 q0, q0, q8 \n" // b * scale - "vqdmulh.s16 q1, q1, q8 \n" // g - "vqdmulh.s16 q2, q2, q8 \n" // r - "vmul.u16 q0, q0, q9 \n" // b * interval_size - "vmul.u16 q1, q1, q9 \n" // g - "vmul.u16 q2, q2, q9 \n" // r - "vadd.u16 q0, q0, q10 \n" // b + interval_offset - "vadd.u16 q1, q1, q10 \n" // g - "vadd.u16 q2, q2, q10 \n" // r - "vqmovn.u16 d0, q0 \n" - "vqmovn.u16 d2, q1 \n" - "vqmovn.u16 d4, q2 \n" - "vst4.8 {d0, d2, d4, d6}, [%0]! \n" // store 8 pixels of ARGB. - "bgt 1b \n" - : "+r"(dst_argb), // %0 - "+r"(width) // %1 - : "r"(scale), // %2 - "r"(interval_size), // %3 - "r"(interval_offset) // %4 - : "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9", "q10"); -} - -// Shade 8 pixels at a time by specified value. -// NOTE vqrdmulh.s16 q10, q10, d0[0] must use a scaler register from 0 to 8. -// Rounding in vqrdmulh does +1 to high if high bit of low s16 is set. -void ARGBShadeRow_NEON(const uint8_t* src_argb, - uint8_t* dst_argb, - int width, - uint32_t value) { - asm volatile( - "vdup.u32 q0, %3 \n" // duplicate scale value. - "vzip.u8 d0, d1 \n" // d0 aarrggbb. - "vshr.u16 q0, q0, #1 \n" // scale / 2. - - // 8 pixel loop. - "1: \n" - "vld4.8 {d20, d22, d24, d26}, [%0]! \n" // load 8 pixels of ARGB. - "subs %2, %2, #8 \n" // 8 processed per loop. - "vmovl.u8 q10, d20 \n" // b (0 .. 255) - "vmovl.u8 q11, d22 \n" - "vmovl.u8 q12, d24 \n" - "vmovl.u8 q13, d26 \n" - "vqrdmulh.s16 q10, q10, d0[0] \n" // b * scale * 2 - "vqrdmulh.s16 q11, q11, d0[1] \n" // g - "vqrdmulh.s16 q12, q12, d0[2] \n" // r - "vqrdmulh.s16 q13, q13, d0[3] \n" // a - "vqmovn.u16 d20, q10 \n" - "vqmovn.u16 d22, q11 \n" - "vqmovn.u16 d24, q12 \n" - "vqmovn.u16 d26, q13 \n" - "vst4.8 {d20, d22, d24, d26}, [%1]! \n" // store 8 pixels of ARGB. - "bgt 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_argb), // %1 - "+r"(width) // %2 - : "r"(value) // %3 - : "cc", "memory", "q0", "q10", "q11", "q12", "q13"); -} - -// Convert 8 ARGB pixels (64 bytes) to 8 Gray ARGB pixels -// Similar to ARGBToYJ but stores ARGB. -// C code is (29 * b + 150 * g + 77 * r + 128) >> 8; -void ARGBGrayRow_NEON(const uint8_t* src_argb, uint8_t* dst_argb, int width) { - asm volatile( - "vmov.u8 d24, #29 \n" // B * 0.1140 coefficient - "vmov.u8 d25, #150 \n" // G * 0.5870 coefficient - "vmov.u8 d26, #77 \n" // R * 0.2990 coefficient - "1: \n" - "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels. - "subs %2, %2, #8 \n" // 8 processed per loop. - "vmull.u8 q2, d0, d24 \n" // B - "vmlal.u8 q2, d1, d25 \n" // G - "vmlal.u8 q2, d2, d26 \n" // R - "vqrshrn.u16 d0, q2, #8 \n" // 16 bit to 8 bit B - "vmov d1, d0 \n" // G - "vmov d2, d0 \n" // R - "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 ARGB pixels. - "bgt 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_argb), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "q0", "q1", "q2", "q12", "q13"); -} - -// Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels. 
-// b = (r * 35 + g * 68 + b * 17) >> 7 -// g = (r * 45 + g * 88 + b * 22) >> 7 -// r = (r * 50 + g * 98 + b * 24) >> 7 -void ARGBSepiaRow_NEON(uint8_t* dst_argb, int width) { - asm volatile( - "vmov.u8 d20, #17 \n" // BB coefficient - "vmov.u8 d21, #68 \n" // BG coefficient - "vmov.u8 d22, #35 \n" // BR coefficient - "vmov.u8 d24, #22 \n" // GB coefficient - "vmov.u8 d25, #88 \n" // GG coefficient - "vmov.u8 d26, #45 \n" // GR coefficient - "vmov.u8 d28, #24 \n" // BB coefficient - "vmov.u8 d29, #98 \n" // BG coefficient - "vmov.u8 d30, #50 \n" // BR coefficient - "1: \n" - "vld4.8 {d0, d1, d2, d3}, [%0] \n" // load 8 ARGB pixels. - "subs %1, %1, #8 \n" // 8 processed per loop. - "vmull.u8 q2, d0, d20 \n" // B to Sepia B - "vmlal.u8 q2, d1, d21 \n" // G - "vmlal.u8 q2, d2, d22 \n" // R - "vmull.u8 q3, d0, d24 \n" // B to Sepia G - "vmlal.u8 q3, d1, d25 \n" // G - "vmlal.u8 q3, d2, d26 \n" // R - "vmull.u8 q8, d0, d28 \n" // B to Sepia R - "vmlal.u8 q8, d1, d29 \n" // G - "vmlal.u8 q8, d2, d30 \n" // R - "vqshrn.u16 d0, q2, #7 \n" // 16 bit to 8 bit B - "vqshrn.u16 d1, q3, #7 \n" // 16 bit to 8 bit G - "vqshrn.u16 d2, q8, #7 \n" // 16 bit to 8 bit R - "vst4.8 {d0, d1, d2, d3}, [%0]! \n" // store 8 ARGB pixels. - "bgt 1b \n" - : "+r"(dst_argb), // %0 - "+r"(width) // %1 - : - : "cc", "memory", "q0", "q1", "q2", "q3", "q10", "q11", "q12", "q13", - "q14", "q15"); -} - -// Tranform 8 ARGB pixels (32 bytes) with color matrix. -// TODO(fbarchard): Was same as Sepia except matrix is provided. This function -// needs to saturate. Consider doing a non-saturating version. -void ARGBColorMatrixRow_NEON(const uint8_t* src_argb, - uint8_t* dst_argb, - const int8_t* matrix_argb, - int width) { - asm volatile( - "vld1.8 {q2}, [%3] \n" // load 3 ARGB vectors. - "vmovl.s8 q0, d4 \n" // B,G coefficients s16. - "vmovl.s8 q1, d5 \n" // R,A coefficients s16. - - "1: \n" - "vld4.8 {d16, d18, d20, d22}, [%0]! \n" // load 8 ARGB pixels. - "subs %2, %2, #8 \n" // 8 processed per loop. - "vmovl.u8 q8, d16 \n" // b (0 .. 
255) 16 bit - "vmovl.u8 q9, d18 \n" // g - "vmovl.u8 q10, d20 \n" // r - "vmovl.u8 q11, d22 \n" // a - "vmul.s16 q12, q8, d0[0] \n" // B = B * Matrix B - "vmul.s16 q13, q8, d1[0] \n" // G = B * Matrix G - "vmul.s16 q14, q8, d2[0] \n" // R = B * Matrix R - "vmul.s16 q15, q8, d3[0] \n" // A = B * Matrix A - "vmul.s16 q4, q9, d0[1] \n" // B += G * Matrix B - "vmul.s16 q5, q9, d1[1] \n" // G += G * Matrix G - "vmul.s16 q6, q9, d2[1] \n" // R += G * Matrix R - "vmul.s16 q7, q9, d3[1] \n" // A += G * Matrix A - "vqadd.s16 q12, q12, q4 \n" // Accumulate B - "vqadd.s16 q13, q13, q5 \n" // Accumulate G - "vqadd.s16 q14, q14, q6 \n" // Accumulate R - "vqadd.s16 q15, q15, q7 \n" // Accumulate A - "vmul.s16 q4, q10, d0[2] \n" // B += R * Matrix B - "vmul.s16 q5, q10, d1[2] \n" // G += R * Matrix G - "vmul.s16 q6, q10, d2[2] \n" // R += R * Matrix R - "vmul.s16 q7, q10, d3[2] \n" // A += R * Matrix A - "vqadd.s16 q12, q12, q4 \n" // Accumulate B - "vqadd.s16 q13, q13, q5 \n" // Accumulate G - "vqadd.s16 q14, q14, q6 \n" // Accumulate R - "vqadd.s16 q15, q15, q7 \n" // Accumulate A - "vmul.s16 q4, q11, d0[3] \n" // B += A * Matrix B - "vmul.s16 q5, q11, d1[3] \n" // G += A * Matrix G - "vmul.s16 q6, q11, d2[3] \n" // R += A * Matrix R - "vmul.s16 q7, q11, d3[3] \n" // A += A * Matrix A - "vqadd.s16 q12, q12, q4 \n" // Accumulate B - "vqadd.s16 q13, q13, q5 \n" // Accumulate G - "vqadd.s16 q14, q14, q6 \n" // Accumulate R - "vqadd.s16 q15, q15, q7 \n" // Accumulate A - "vqshrun.s16 d16, q12, #6 \n" // 16 bit to 8 bit B - "vqshrun.s16 d18, q13, #6 \n" // 16 bit to 8 bit G - "vqshrun.s16 d20, q14, #6 \n" // 16 bit to 8 bit R - "vqshrun.s16 d22, q15, #6 \n" // 16 bit to 8 bit A - "vst4.8 {d16, d18, d20, d22}, [%1]! \n" // store 8 ARGB pixels. - "bgt 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_argb), // %1 - "+r"(width) // %2 - : "r"(matrix_argb) // %3 - : "cc", "memory", "q0", "q1", "q2", "q4", "q5", "q6", "q7", "q8", "q9", - "q10", "q11", "q12", "q13", "q14", "q15"); -} - -// Multiply 2 rows of ARGB pixels together, 8 pixels at a time. -void ARGBMultiplyRow_NEON(const uint8_t* src_argb, - const uint8_t* src_argb1, - uint8_t* dst_argb, - int width) { - asm volatile( - // 8 pixel loop. - "1: \n" - "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels. - "vld4.8 {d1, d3, d5, d7}, [%1]! \n" // load 8 more ARGB - "subs %3, %3, #8 \n" // 8 processed per loop. - "vmull.u8 q0, d0, d1 \n" // multiply B - "vmull.u8 q1, d2, d3 \n" // multiply G - "vmull.u8 q2, d4, d5 \n" // multiply R - "vmull.u8 q3, d6, d7 \n" // multiply A - "vrshrn.u16 d0, q0, #8 \n" // 16 bit to 8 bit B - "vrshrn.u16 d1, q1, #8 \n" // 16 bit to 8 bit G - "vrshrn.u16 d2, q2, #8 \n" // 16 bit to 8 bit R - "vrshrn.u16 d3, q3, #8 \n" // 16 bit to 8 bit A - "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels. - "bgt 1b \n" - : "+r"(src_argb), // %0 - "+r"(src_argb1), // %1 - "+r"(dst_argb), // %2 - "+r"(width) // %3 - : - : "cc", "memory", "q0", "q1", "q2", "q3"); -} - -// Add 2 rows of ARGB pixels together, 8 pixels at a time. -void ARGBAddRow_NEON(const uint8_t* src_argb, - const uint8_t* src_argb1, - uint8_t* dst_argb, - int width) { - asm volatile( - // 8 pixel loop. - "1: \n" - "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels. - "vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load 8 more ARGB - "subs %3, %3, #8 \n" // 8 processed per loop. - "vqadd.u8 q0, q0, q2 \n" // add B, G - "vqadd.u8 q1, q1, q3 \n" // add R, A - "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels. 
- "bgt 1b \n" - : "+r"(src_argb), // %0 - "+r"(src_argb1), // %1 - "+r"(dst_argb), // %2 - "+r"(width) // %3 - : - : "cc", "memory", "q0", "q1", "q2", "q3"); -} - -// Subtract 2 rows of ARGB pixels, 8 pixels at a time. -void ARGBSubtractRow_NEON(const uint8_t* src_argb, - const uint8_t* src_argb1, - uint8_t* dst_argb, - int width) { - asm volatile( - // 8 pixel loop. - "1: \n" - "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels. - "vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load 8 more ARGB - "subs %3, %3, #8 \n" // 8 processed per loop. - "vqsub.u8 q0, q0, q2 \n" // subtract B, G - "vqsub.u8 q1, q1, q3 \n" // subtract R, A - "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels. - "bgt 1b \n" - : "+r"(src_argb), // %0 - "+r"(src_argb1), // %1 - "+r"(dst_argb), // %2 - "+r"(width) // %3 - : - : "cc", "memory", "q0", "q1", "q2", "q3"); -} - -// Adds Sobel X and Sobel Y and stores Sobel into ARGB. -// A = 255 -// R = Sobel -// G = Sobel -// B = Sobel -void SobelRow_NEON(const uint8_t* src_sobelx, - const uint8_t* src_sobely, - uint8_t* dst_argb, - int width) { - asm volatile( - "vmov.u8 d3, #255 \n" // alpha - // 8 pixel loop. - "1: \n" - "vld1.8 {d0}, [%0]! \n" // load 8 sobelx. - "vld1.8 {d1}, [%1]! \n" // load 8 sobely. - "subs %3, %3, #8 \n" // 8 processed per loop. - "vqadd.u8 d0, d0, d1 \n" // add - "vmov.u8 d1, d0 \n" - "vmov.u8 d2, d0 \n" - "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels. - "bgt 1b \n" - : "+r"(src_sobelx), // %0 - "+r"(src_sobely), // %1 - "+r"(dst_argb), // %2 - "+r"(width) // %3 - : - : "cc", "memory", "q0", "q1"); -} - -// Adds Sobel X and Sobel Y and stores Sobel into plane. -void SobelToPlaneRow_NEON(const uint8_t* src_sobelx, - const uint8_t* src_sobely, - uint8_t* dst_y, - int width) { - asm volatile( - // 16 pixel loop. - "1: \n" - "vld1.8 {q0}, [%0]! \n" // load 16 sobelx. - "vld1.8 {q1}, [%1]! \n" // load 16 sobely. - "subs %3, %3, #16 \n" // 16 processed per loop. - "vqadd.u8 q0, q0, q1 \n" // add - "vst1.8 {q0}, [%2]! \n" // store 16 pixels. - "bgt 1b \n" - : "+r"(src_sobelx), // %0 - "+r"(src_sobely), // %1 - "+r"(dst_y), // %2 - "+r"(width) // %3 - : - : "cc", "memory", "q0", "q1"); -} - -// Mixes Sobel X, Sobel Y and Sobel into ARGB. -// A = 255 -// R = Sobel X -// G = Sobel -// B = Sobel Y -void SobelXYRow_NEON(const uint8_t* src_sobelx, - const uint8_t* src_sobely, - uint8_t* dst_argb, - int width) { - asm volatile( - "vmov.u8 d3, #255 \n" // alpha - // 8 pixel loop. - "1: \n" - "vld1.8 {d2}, [%0]! \n" // load 8 sobelx. - "vld1.8 {d0}, [%1]! \n" // load 8 sobely. - "subs %3, %3, #8 \n" // 8 processed per loop. - "vqadd.u8 d1, d0, d2 \n" // add - "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels. - "bgt 1b \n" - : "+r"(src_sobelx), // %0 - "+r"(src_sobely), // %1 - "+r"(dst_argb), // %2 - "+r"(width) // %3 - : - : "cc", "memory", "q0", "q1"); -} - -// SobelX as a matrix is -// -1 0 1 -// -2 0 2 -// -1 0 1 -void SobelXRow_NEON(const uint8_t* src_y0, - const uint8_t* src_y1, - const uint8_t* src_y2, - uint8_t* dst_sobelx, - int width) { - asm volatile( - "1: \n" - "vld1.8 {d0}, [%0],%5 \n" // top - "vld1.8 {d1}, [%0],%6 \n" - "vsubl.u8 q0, d0, d1 \n" - "vld1.8 {d2}, [%1],%5 \n" // center * 2 - "vld1.8 {d3}, [%1],%6 \n" - "vsubl.u8 q1, d2, d3 \n" - "vadd.s16 q0, q0, q1 \n" - "vadd.s16 q0, q0, q1 \n" - "vld1.8 {d2}, [%2],%5 \n" // bottom - "vld1.8 {d3}, [%2],%6 \n" - "subs %4, %4, #8 \n" // 8 pixels - "vsubl.u8 q1, d2, d3 \n" - "vadd.s16 q0, q0, q1 \n" - "vabs.s16 q0, q0 \n" - "vqmovn.u16 d0, q0 \n" - "vst1.8 {d0}, [%3]! 
\n" // store 8 sobelx - "bgt 1b \n" - : "+r"(src_y0), // %0 - "+r"(src_y1), // %1 - "+r"(src_y2), // %2 - "+r"(dst_sobelx), // %3 - "+r"(width) // %4 - : "r"(2), // %5 - "r"(6) // %6 - : "cc", "memory", "q0", "q1" // Clobber List - ); -} - -// SobelY as a matrix is -// -1 -2 -1 -// 0 0 0 -// 1 2 1 -void SobelYRow_NEON(const uint8_t* src_y0, - const uint8_t* src_y1, - uint8_t* dst_sobely, - int width) { - asm volatile( - "1: \n" - "vld1.8 {d0}, [%0],%4 \n" // left - "vld1.8 {d1}, [%1],%4 \n" - "vsubl.u8 q0, d0, d1 \n" - "vld1.8 {d2}, [%0],%4 \n" // center * 2 - "vld1.8 {d3}, [%1],%4 \n" - "vsubl.u8 q1, d2, d3 \n" - "vadd.s16 q0, q0, q1 \n" - "vadd.s16 q0, q0, q1 \n" - "vld1.8 {d2}, [%0],%5 \n" // right - "vld1.8 {d3}, [%1],%5 \n" - "subs %3, %3, #8 \n" // 8 pixels - "vsubl.u8 q1, d2, d3 \n" - "vadd.s16 q0, q0, q1 \n" - "vabs.s16 q0, q0 \n" - "vqmovn.u16 d0, q0 \n" - "vst1.8 {d0}, [%2]! \n" // store 8 sobely - "bgt 1b \n" - : "+r"(src_y0), // %0 - "+r"(src_y1), // %1 - "+r"(dst_sobely), // %2 - "+r"(width) // %3 - : "r"(1), // %4 - "r"(6) // %5 - : "cc", "memory", "q0", "q1" // Clobber List - ); -} - -// %y passes a float as a scalar vector for vector * scalar multiply. -// the regoster must be d0 to d15 and indexed with [0] or [1] to access -// the float in the first or second float of the d-reg - -void HalfFloat1Row_NEON(const uint16_t* src, - uint16_t* dst, - float /*unused*/, - int width) { - asm volatile( - - "1: \n" - "vld1.8 {q1}, [%0]! \n" // load 8 shorts - "subs %2, %2, #8 \n" // 8 pixels per loop - "vmovl.u16 q2, d2 \n" // 8 int's - "vmovl.u16 q3, d3 \n" - "vcvt.f32.u32 q2, q2 \n" // 8 floats - "vcvt.f32.u32 q3, q3 \n" - "vmul.f32 q2, q2, %y3 \n" // adjust exponent - "vmul.f32 q3, q3, %y3 \n" - "vqshrn.u32 d2, q2, #13 \n" // isolate halffloat - "vqshrn.u32 d3, q3, #13 \n" - "vst1.8 {q1}, [%1]! \n" - "bgt 1b \n" - : "+r"(src), // %0 - "+r"(dst), // %1 - "+r"(width) // %2 - : "w"(1.9259299444e-34f) // %3 - : "cc", "memory", "q1", "q2", "q3"); -} - -void HalfFloatRow_NEON(const uint16_t* src, - uint16_t* dst, - float scale, - int width) { - asm volatile( - - "1: \n" - "vld1.8 {q1}, [%0]! \n" // load 8 shorts - "subs %2, %2, #8 \n" // 8 pixels per loop - "vmovl.u16 q2, d2 \n" // 8 int's - "vmovl.u16 q3, d3 \n" - "vcvt.f32.u32 q2, q2 \n" // 8 floats - "vcvt.f32.u32 q3, q3 \n" - "vmul.f32 q2, q2, %y3 \n" // adjust exponent - "vmul.f32 q3, q3, %y3 \n" - "vqshrn.u32 d2, q2, #13 \n" // isolate halffloat - "vqshrn.u32 d3, q3, #13 \n" - "vst1.8 {q1}, [%1]! \n" - "bgt 1b \n" - : "+r"(src), // %0 - "+r"(dst), // %1 - "+r"(width) // %2 - : "w"(scale * 1.9259299444e-34f) // %3 - : "cc", "memory", "q1", "q2", "q3"); -} - -void ByteToFloatRow_NEON(const uint8_t* src, - float* dst, - float scale, - int width) { - asm volatile( - - "1: \n" - "vld1.8 {d2}, [%0]! \n" // load 8 bytes - "subs %2, %2, #8 \n" // 8 pixels per loop - "vmovl.u8 q1, d2 \n" // 8 shorts - "vmovl.u16 q2, d2 \n" // 8 ints - "vmovl.u16 q3, d3 \n" - "vcvt.f32.u32 q2, q2 \n" // 8 floats - "vcvt.f32.u32 q3, q3 \n" - "vmul.f32 q2, q2, %y3 \n" // scale - "vmul.f32 q3, q3, %y3 \n" - "vst1.8 {q2, q3}, [%1]! \n" // store 8 floats - "bgt 1b \n" - : "+r"(src), // %0 - "+r"(dst), // %1 - "+r"(width) // %2 - : "w"(scale) // %3 - : "cc", "memory", "q1", "q2", "q3"); -} - -// filter 5 rows with 1, 4, 6, 4, 1 coefficients to produce 1 row. 
-void GaussCol_NEON(const uint16_t* src0, - const uint16_t* src1, - const uint16_t* src2, - const uint16_t* src3, - const uint16_t* src4, - uint32_t* dst, - int width) { - asm volatile( - "vmov.u16 d6, #4 \n" // constant 4 - "vmov.u16 d7, #6 \n" // constant 6 - - "1: \n" - "vld1.16 {q1}, [%0]! \n" // load 8 samples, 5 rows - "vld1.16 {q2}, [%4]! \n" - "vaddl.u16 q0, d2, d4 \n" // * 1 - "vaddl.u16 q1, d3, d5 \n" // * 1 - "vld1.16 {q2}, [%1]! \n" - "vmlal.u16 q0, d4, d6 \n" // * 4 - "vmlal.u16 q1, d5, d6 \n" // * 4 - "vld1.16 {q2}, [%2]! \n" - "vmlal.u16 q0, d4, d7 \n" // * 6 - "vmlal.u16 q1, d5, d7 \n" // * 6 - "vld1.16 {q2}, [%3]! \n" - "vmlal.u16 q0, d4, d6 \n" // * 4 - "vmlal.u16 q1, d5, d6 \n" // * 4 - "subs %6, %6, #8 \n" // 8 processed per loop - "vst1.32 {q0, q1}, [%5]! \n" // store 8 samples - "bgt 1b \n" - : "+r"(src0), // %0 - "+r"(src1), // %1 - "+r"(src2), // %2 - "+r"(src3), // %3 - "+r"(src4), // %4 - "+r"(dst), // %5 - "+r"(width) // %6 - : - : "cc", "memory", "q0", "q1", "q2", "q3"); -} - -// filter 5 rows with 1, 4, 6, 4, 1 coefficients to produce 1 row. -void GaussRow_NEON(const uint32_t* src, uint16_t* dst, int width) { - const uint32_t* src1 = src + 1; - const uint32_t* src2 = src + 2; - const uint32_t* src3 = src + 3; - asm volatile( - "vmov.u32 q10, #4 \n" // constant 4 - "vmov.u32 q11, #6 \n" // constant 6 - - "1: \n" - "vld1.32 {q0, q1}, [%0]! \n" // load 12 source samples - "vld1.32 {q2}, [%0] \n" - "vadd.u32 q0, q0, q1 \n" // * 1 - "vadd.u32 q1, q1, q2 \n" // * 1 - "vld1.32 {q2, q3}, [%2]! \n" - "vmla.u32 q0, q2, q11 \n" // * 6 - "vmla.u32 q1, q3, q11 \n" // * 6 - "vld1.32 {q2, q3}, [%1]! \n" - "vld1.32 {q8, q9}, [%3]! \n" - "vadd.u32 q2, q2, q8 \n" // add rows for * 4 - "vadd.u32 q3, q3, q9 \n" - "vmla.u32 q0, q2, q10 \n" // * 4 - "vmla.u32 q1, q3, q10 \n" // * 4 - "subs %5, %5, #8 \n" // 8 processed per loop - "vqshrn.u32 d0, q0, #8 \n" // round and pack - "vqshrn.u32 d1, q1, #8 \n" - "vst1.u16 {q0}, [%4]! \n" // store 8 samples - "bgt 1b \n" - : "+r"(src), // %0 - "+r"(src1), // %1 - "+r"(src2), // %2 - "+r"(src3), // %3 - "+r"(dst), // %4 - "+r"(width) // %5 - : - : "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11"); -} - -// Convert biplanar NV21 to packed YUV24 -void NV21ToYUV24Row_NEON(const uint8_t* src_y, - const uint8_t* src_vu, - uint8_t* dst_yuv24, - int width) { - asm volatile( - "1: \n" - "vld1.8 {q2}, [%0]! \n" // load 16 Y values - "vld2.8 {d0, d2}, [%1]! \n" // load 8 VU values - "vmov d1, d0 \n" - "vzip.u8 d0, d1 \n" // VV - "vmov d3, d2 \n" - "vzip.u8 d2, d3 \n" // UU - "subs %3, %3, #16 \n" // 16 pixels per loop - "vst3.8 {d0, d2, d4}, [%2]! \n" // store 16 YUV pixels - "vst3.8 {d1, d3, d5}, [%2]! \n" - "bgt 1b \n" - : "+r"(src_y), // %0 - "+r"(src_vu), // %1 - "+r"(dst_yuv24), // %2 - "+r"(width) // %3 - : - : "cc", "memory", "q0", "q1", "q2"); -} - -void AYUVToUVRow_NEON(const uint8_t* src_ayuv, - int src_stride_ayuv, - uint8_t* dst_uv, - int width) { - asm volatile( - "add %1, %0, %1 \n" // src_stride + src_AYUV - "1: \n" - "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 AYUV pixels. - "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 AYUV - // pixels. - "vpaddl.u8 q0, q0 \n" // V 16 bytes -> 8 shorts. - "vpaddl.u8 q1, q1 \n" // U 16 bytes -> 8 shorts. - "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more AYUV - // pixels. - "vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 AYUV - // pixels. - "vpadal.u8 q0, q4 \n" // B 16 bytes -> 8 shorts. - "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts. 
- "vqrshrun.s16 d1, q0, #2 \n" // 2x2 average - "vqrshrun.s16 d0, q1, #2 \n" - "subs %3, %3, #16 \n" // 16 processed per loop. - "vst2.8 {d0, d1}, [%2]! \n" // store 8 pixels UV. - "bgt 1b \n" - : "+r"(src_ayuv), // %0 - "+r"(src_stride_ayuv), // %1 - "+r"(dst_uv), // %2 - "+r"(width) // %3 - : - : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7"); -} - -void AYUVToVURow_NEON(const uint8_t* src_ayuv, - int src_stride_ayuv, - uint8_t* dst_vu, - int width) { - asm volatile( - "add %1, %0, %1 \n" // src_stride + src_AYUV - "1: \n" - "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 AYUV pixels. - "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 AYUV - // pixels. - "vpaddl.u8 q0, q0 \n" // V 16 bytes -> 8 shorts. - "vpaddl.u8 q1, q1 \n" // U 16 bytes -> 8 shorts. - "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more AYUV - // pixels. - "vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 AYUV - // pixels. - "vpadal.u8 q0, q4 \n" // B 16 bytes -> 8 shorts. - "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts. - "vqrshrun.s16 d0, q0, #2 \n" // 2x2 average - "vqrshrun.s16 d1, q1, #2 \n" - "subs %3, %3, #16 \n" // 16 processed per loop. - "vst2.8 {d0, d1}, [%2]! \n" // store 8 pixels VU. - "bgt 1b \n" - : "+r"(src_ayuv), // %0 - "+r"(src_stride_ayuv), // %1 - "+r"(dst_vu), // %2 - "+r"(width) // %3 - : - : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7"); -} - -// Copy row of AYUV Y's into Y. -// Similar to ARGBExtractAlphaRow_NEON -void AYUVToYRow_NEON(const uint8_t* src_ayuv, uint8_t* dst_y, int width) { - asm volatile( - "1: \n" - "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 AYUV pixels - "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 AYUV pixels - "subs %2, %2, #16 \n" // 16 processed per loop - "vst1.8 {q2}, [%1]! \n" // store 16 Y's. - "bgt 1b \n" - : "+r"(src_ayuv), // %0 - "+r"(dst_y), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "q0", "q1", "q2", "q3"); -} - -// Convert UV plane of NV12 to VU of NV21. -void SwapUVRow_NEON(const uint8_t* src_uv, uint8_t* dst_vu, int width) { - asm volatile( - "1: \n" - "vld2.8 {d0, d2}, [%0]! \n" // load 16 UV values - "vld2.8 {d1, d3}, [%0]! \n" - "vorr.u8 q2, q0, q0 \n" // move U after V - "subs %2, %2, #16 \n" // 16 pixels per loop - "vst2.8 {q1, q2}, [%1]! \n" // store 16 VU pixels - "bgt 1b \n" - : "+r"(src_uv), // %0 - "+r"(dst_vu), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "q0", "q1", "q2"); -} - -void HalfMergeUVRow_NEON(const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_uv, - int width) { - const uint8_t* src_u_1 = src_u + src_stride_u; - const uint8_t* src_v_1 = src_v + src_stride_v; - asm volatile( - "1: \n" - "vld1.8 {q0}, [%0]! \n" // load 16 U values - "vld1.8 {q1}, [%2]! \n" // load 16 V values - "vld1.8 {q2}, [%1]! \n" - "vld1.8 {q3}, [%3]! \n" - "vpaddl.u8 q0, q0 \n" // half size - "vpaddl.u8 q1, q1 \n" - "vpadal.u8 q0, q2 \n" - "vpadal.u8 q1, q3 \n" - "vqrshrn.u16 d0, q0, #2 \n" - "vqrshrn.u16 d1, q1, #2 \n" - "subs %5, %5, #16 \n" // 16 src pixels per loop - "vst2.8 {d0, d1}, [%4]! \n" // store 8 UV pixels - "bgt 1b \n" - : "+r"(src_u), // %0 - "+r"(src_u_1), // %1 - "+r"(src_v), // %2 - "+r"(src_v_1), // %3 - "+r"(dst_uv), // %4 - "+r"(width) // %5 - : - : "cc", "memory", "q0", "q1", "q2", "q3"); -} - -void SplitUVRow_16_NEON(const uint16_t* src_uv, - uint16_t* dst_u, - uint16_t* dst_v, - int depth, - int width) { - int shift = depth - 16; // Negative for right shift. - asm volatile( - "vdup.16 q2, %4 \n" - "1: \n" - "vld2.16 {q0, q1}, [%0]! 
\n" // load 8 UV - "vshl.u16 q0, q0, q2 \n" - "vshl.u16 q1, q1, q2 \n" - "subs %3, %3, #8 \n" // 8 src pixels per loop - "vst1.16 {q0}, [%1]! \n" // store 8 U pixels - "vst1.16 {q1}, [%2]! \n" // store 8 V pixels - "bgt 1b \n" - : "+r"(src_uv), // %0 - "+r"(dst_u), // %1 - "+r"(dst_v), // %2 - "+r"(width) // %3 - : "r"(shift) // %4 - : "cc", "memory", "q0", "q1", "q2", "q3", "q4"); -} - -void MergeUVRow_16_NEON(const uint16_t* src_u, - const uint16_t* src_v, - uint16_t* dst_uv, - int depth, - int width) { - int shift = 16 - depth; - asm volatile( - "vdup.16 q2, %4 \n" - "1: \n" - "vld1.16 {q0}, [%0]! \n" // load 8 U - "vld1.16 {q1}, [%1]! \n" // load 8 V - "vshl.u16 q0, q0, q2 \n" - "vshl.u16 q1, q1, q2 \n" - "subs %3, %3, #8 \n" // 8 src pixels per loop - "vst2.16 {q0, q1}, [%2]! \n" // store 8 UV pixels - "bgt 1b \n" - : "+r"(src_u), // %0 - "+r"(src_v), // %1 - "+r"(dst_uv), // %2 - "+r"(width) // %3 - : "r"(shift) // %4 - : "cc", "memory", "q0", "q1", "q2"); -} - -void MultiplyRow_16_NEON(const uint16_t* src_y, - uint16_t* dst_y, - int scale, - int width) { - asm volatile( - "vdup.16 q2, %2 \n" - "1: \n" - "vld1.16 {q0}, [%0]! \n" - "vld1.16 {q1}, [%0]! \n" - "vmul.u16 q0, q0, q2 \n" - "vmul.u16 q1, q1, q2 \n" - "vst1.16 {q0}, [%1]! \n" - "vst1.16 {q1}, [%1]! \n" - "subs %3, %3, #16 \n" // 16 src pixels per loop - "bgt 1b \n" - : "+r"(src_y), // %0 - "+r"(dst_y), // %1 - "+r"(scale), // %2 - "+r"(width) // %3 - : - : "cc", "memory", "q0", "q1", "q2"); -} - -void DivideRow_16_NEON(const uint16_t* src_y, - uint16_t* dst_y, - int scale, - int width) { - asm volatile( - "vdup.16 q0, %2 \n" - "1: \n" - "vld1.16 {q1}, [%0]! \n" - "vld1.16 {q2}, [%0]! \n" - "vmovl.u16 q3, d2 \n" - "vmovl.u16 q1, d3 \n" - "vmovl.u16 q4, d4 \n" - "vmovl.u16 q2, d5 \n" - "vshl.u32 q3, q3, q0 \n" - "vshl.u32 q4, q4, q0 \n" - "vshl.u32 q1, q1, q0 \n" - "vshl.u32 q2, q2, q0 \n" - "vmovn.u32 d2, q3 \n" - "vmovn.u32 d3, q1 \n" - "vmovn.u32 d4, q4 \n" - "vmovn.u32 d5, q2 \n" - "vst1.16 {q1}, [%1]! \n" - "vst1.16 {q2}, [%1]! \n" - "subs %3, %3, #16 \n" // 16 src pixels per loop - "bgt 1b \n" - : "+r"(src_y), // %0 - "+r"(dst_y), // %1 - "+r"(scale), // %2 - "+r"(width) // %3 - : - : "cc", "memory", "q0", "q1", "q2", "q3", "q4"); -} - -#endif // !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__).. - -#ifdef __cplusplus -} // extern "C" -} // namespace libyuv -#endif diff --git a/thirdparty/libyuv/source/row_neon64.cc b/thirdparty/libyuv/source/row_neon64.cc deleted file mode 100644 index ba6ca5d..0000000 --- a/thirdparty/libyuv/source/row_neon64.cc +++ /dev/null @@ -1,3855 +0,0 @@ -/* - * Copyright 2014 The LibYuv Project Authors. All rights reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#include "libyuv/row.h" - -#ifdef __cplusplus -namespace libyuv { -extern "C" { -#endif - -// This module is for GCC Neon armv8 64 bit. 
-#if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__) - -// v0.8h: Y -// v1.16b: 8U, 8V - -// Read 8 Y, 4 U and 4 V from 422 -#define READYUV422 \ - "ldr d0, [%[src_y]], #8 \n" \ - "ld1 {v1.s}[0], [%[src_u]], #4 \n" \ - "ld1 {v1.s}[1], [%[src_v]], #4 \n" \ - "zip1 v0.16b, v0.16b, v0.16b \n" \ - "prfm pldl1keep, [%[src_y], 448] \n" \ - "zip1 v1.16b, v1.16b, v1.16b \n" \ - "prfm pldl1keep, [%[src_u], 128] \n" \ - "prfm pldl1keep, [%[src_v], 128] \n" - -// Read 8 Y, 8 U and 8 V from 444 -#define READYUV444 \ - "ldr d0, [%[src_y]], #8 \n" \ - "ld1 {v1.d}[0], [%[src_u]], #8 \n" \ - "prfm pldl1keep, [%[src_y], 448] \n" \ - "ld1 {v1.d}[1], [%[src_v]], #8 \n" \ - "prfm pldl1keep, [%[src_u], 448] \n" \ - "zip1 v0.16b, v0.16b, v0.16b \n" \ - "prfm pldl1keep, [%[src_v], 448] \n" - -// Read 8 Y, and set 4 U and 4 V to 128 -#define READYUV400 \ - "ldr d0, [%[src_y]], #8 \n" \ - "movi v1.16b, #128 \n" \ - "prfm pldl1keep, [%[src_y], 448] \n" \ - "zip1 v0.16b, v0.16b, v0.16b \n" - -static const uvec8 kNV12Table = {0, 0, 2, 2, 4, 4, 6, 6, - 1, 1, 3, 3, 5, 5, 7, 7}; -static const uvec8 kNV21Table = {1, 1, 3, 3, 5, 5, 7, 7, - 0, 0, 2, 2, 4, 4, 6, 6}; - -// Read 8 Y and 4 UV from NV12 or NV21 -#define READNV12 \ - "ldr d0, [%[src_y]], #8 \n" \ - "ldr d1, [%[src_uv]], #8 \n" \ - "zip1 v0.16b, v0.16b, v0.16b \n" \ - "prfm pldl1keep, [%[src_y], 448] \n" \ - "tbl v1.16b, {v1.16b}, v2.16b \n" \ - "prfm pldl1keep, [%[src_uv], 448] \n" - -// Read 8 YUY2 -#define READYUY2 \ - "ld2 {v0.8b, v1.8b}, [%[src_yuy2]], #16 \n" \ - "zip1 v0.16b, v0.16b, v0.16b \n" \ - "prfm pldl1keep, [%[src_yuy2], 448] \n" \ - "tbl v1.16b, {v1.16b}, v2.16b \n" - -// Read 8 UYVY -#define READUYVY \ - "ld2 {v3.8b, v4.8b}, [%[src_uyvy]], #16 \n" \ - "zip1 v0.16b, v4.16b, v4.16b \n" \ - "prfm pldl1keep, [%[src_uyvy], 448] \n" \ - "tbl v1.16b, {v3.16b}, v2.16b \n" - -// UB VR UG VG -// YG BB BG BR -#define YUVTORGB_SETUP \ - "ld4r {v28.16b, v29.16b, v30.16b, v31.16b}, [%[kUVCoeff]] \n" \ - "ld4r {v24.8h, v25.8h, v26.8h, v27.8h}, [%[kRGBCoeffBias]] \n" - -// v16.8h: B -// v17.8h: G -// v18.8h: R - -// Convert from YUV to 2.14 fixed point RGB -#define YUVTORGB \ - "umull2 v3.4s, v0.8h, v24.8h \n" \ - "umull v6.8h, v1.8b, v30.8b \n" \ - "umull v0.4s, v0.4h, v24.4h \n" \ - "umlal2 v6.8h, v1.16b, v31.16b \n" /* DG */ \ - "uqshrn v0.4h, v0.4s, #16 \n" \ - "uqshrn2 v0.8h, v3.4s, #16 \n" /* Y */ \ - "umull v4.8h, v1.8b, v28.8b \n" /* DB */ \ - "umull2 v5.8h, v1.16b, v29.16b \n" /* DR */ \ - "add v17.8h, v0.8h, v26.8h \n" /* G */ \ - "add v16.8h, v0.8h, v4.8h \n" /* B */ \ - "add v18.8h, v0.8h, v5.8h \n" /* R */ \ - "uqsub v17.8h, v17.8h, v6.8h \n" /* G */ \ - "uqsub v16.8h, v16.8h, v25.8h \n" /* B */ \ - "uqsub v18.8h, v18.8h, v27.8h \n" /* R */ - -// Convert from 2.14 fixed point RGB To 8 bit RGB -#define RGBTORGB8 \ - "uqshrn v17.8b, v17.8h, #6 \n" \ - "uqshrn v16.8b, v16.8h, #6 \n" \ - "uqshrn v18.8b, v18.8h, #6 \n" - -#define YUVTORGB_REGS \ - "v0", "v1", "v3", "v4", "v5", "v6", "v7", "v16", "v17", "v18", "v24", "v25", \ - "v26", "v27", "v28", "v29", "v30", "v31" - -void I444ToARGBRow_NEON(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { - asm volatile( - YUVTORGB_SETUP - "movi v19.8b, #255 \n" /* A */ - "1: \n" READYUV444 YUVTORGB - RGBTORGB8 - "subs %w[width], %w[width], #8 \n" - "st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%[dst_argb]], #32 \n" - "b.gt 1b \n" - : [src_y] "+r"(src_y), // %[src_y] - [src_u] "+r"(src_u), // %[src_u] - [src_v] 
"+r"(src_v), // %[src_v] - [dst_argb] "+r"(dst_argb), // %[dst_argb] - [width] "+r"(width) // %[width] - : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff] - [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias) // %[kRGBCoeffBias] - : "cc", "memory", YUVTORGB_REGS, "v19"); -} - -void I422ToARGBRow_NEON(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { - asm volatile( - YUVTORGB_SETUP - "movi v19.8b, #255 \n" /* A */ - "1: \n" READYUV422 YUVTORGB - RGBTORGB8 - "subs %w[width], %w[width], #8 \n" - "st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%[dst_argb]], #32 \n" - "b.gt 1b \n" - : [src_y] "+r"(src_y), // %[src_y] - [src_u] "+r"(src_u), // %[src_u] - [src_v] "+r"(src_v), // %[src_v] - [dst_argb] "+r"(dst_argb), // %[dst_argb] - [width] "+r"(width) // %[width] - : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff] - [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias) // %[kRGBCoeffBias] - : "cc", "memory", YUVTORGB_REGS, "v19"); -} - -void I444AlphaToARGBRow_NEON(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - const uint8_t* src_a, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { - asm volatile( - YUVTORGB_SETUP - "1: \n" - "ld1 {v19.8b}, [%[src_a]], #8 \n" READYUV444 - "prfm pldl1keep, [%[src_a], 448] \n" YUVTORGB RGBTORGB8 - "subs %w[width], %w[width], #8 \n" - "st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%[dst_argb]], #32 \n" - "b.gt 1b \n" - : [src_y] "+r"(src_y), // %[src_y] - [src_u] "+r"(src_u), // %[src_u] - [src_v] "+r"(src_v), // %[src_v] - [src_a] "+r"(src_a), // %[src_a] - [dst_argb] "+r"(dst_argb), // %[dst_argb] - [width] "+r"(width) // %[width] - : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff] - [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias) // %[kRGBCoeffBias] - : "cc", "memory", YUVTORGB_REGS, "v19"); -} - -void I422AlphaToARGBRow_NEON(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - const uint8_t* src_a, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { - asm volatile( - YUVTORGB_SETUP - "1: \n" - "ld1 {v19.8b}, [%[src_a]], #8 \n" READYUV422 - "prfm pldl1keep, [%[src_a], 448] \n" YUVTORGB RGBTORGB8 - "subs %w[width], %w[width], #8 \n" - "st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%[dst_argb]], #32 \n" - "b.gt 1b \n" - : [src_y] "+r"(src_y), // %[src_y] - [src_u] "+r"(src_u), // %[src_u] - [src_v] "+r"(src_v), // %[src_v] - [src_a] "+r"(src_a), // %[src_a] - [dst_argb] "+r"(dst_argb), // %[dst_argb] - [width] "+r"(width) // %[width] - : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff] - [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias) // %[kRGBCoeffBias] - : "cc", "memory", YUVTORGB_REGS, "v19"); -} - -void I422ToRGBARow_NEON(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* dst_rgba, - const struct YuvConstants* yuvconstants, - int width) { - asm volatile( - YUVTORGB_SETUP - "movi v15.8b, #255 \n" /* A */ - "1: \n" READYUV422 YUVTORGB - RGBTORGB8 - "subs %w[width], %w[width], #8 \n" - "st4 {v15.8b,v16.8b,v17.8b,v18.8b}, [%[dst_rgba]], #32 \n" - "b.gt 1b \n" - : [src_y] "+r"(src_y), // %[src_y] - [src_u] "+r"(src_u), // %[src_u] - [src_v] "+r"(src_v), // %[src_v] - [dst_rgba] "+r"(dst_rgba), // %[dst_rgba] - [width] "+r"(width) // %[width] - : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff] - [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias) // %[kRGBCoeffBias] - : "cc", "memory", YUVTORGB_REGS, "v15"); -} - -void 
I422ToRGB24Row_NEON(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* dst_rgb24, - const struct YuvConstants* yuvconstants, - int width) { - asm volatile( - YUVTORGB_SETUP - "1: \n" READYUV422 YUVTORGB - RGBTORGB8 - "subs %w[width], %w[width], #8 \n" - "st3 {v16.8b,v17.8b,v18.8b}, [%[dst_rgb24]], #24 \n" - "b.gt 1b \n" - : [src_y] "+r"(src_y), // %[src_y] - [src_u] "+r"(src_u), // %[src_u] - [src_v] "+r"(src_v), // %[src_v] - [dst_rgb24] "+r"(dst_rgb24), // %[dst_rgb24] - [width] "+r"(width) // %[width] - : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff] - [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias) // %[kRGBCoeffBias] - : "cc", "memory", YUVTORGB_REGS); -} - -#define ARGBTORGB565 \ - "shll v18.8h, v18.8b, #8 \n" /* R */ \ - "shll v17.8h, v17.8b, #8 \n" /* G */ \ - "shll v16.8h, v16.8b, #8 \n" /* B */ \ - "sri v18.8h, v17.8h, #5 \n" /* RG */ \ - "sri v18.8h, v16.8h, #11 \n" /* RGB */ - -void I422ToRGB565Row_NEON(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* dst_rgb565, - const struct YuvConstants* yuvconstants, - int width) { - asm volatile( - YUVTORGB_SETUP - "1: \n" READYUV422 YUVTORGB - RGBTORGB8 "subs %w[width], %w[width], #8 \n" ARGBTORGB565 - "st1 {v18.8h}, [%[dst_rgb565]], #16 \n" // store 8 pixels RGB565. - "b.gt 1b \n" - : [src_y] "+r"(src_y), // %[src_y] - [src_u] "+r"(src_u), // %[src_u] - [src_v] "+r"(src_v), // %[src_v] - [dst_rgb565] "+r"(dst_rgb565), // %[dst_rgb565] - [width] "+r"(width) // %[width] - : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff] - [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias) // %[kRGBCoeffBias] - : "cc", "memory", YUVTORGB_REGS); -} - -#define ARGBTOARGB1555 \ - "shll v0.8h, v19.8b, #8 \n" /* A */ \ - "shll v18.8h, v18.8b, #8 \n" /* R */ \ - "shll v17.8h, v17.8b, #8 \n" /* G */ \ - "shll v16.8h, v16.8b, #8 \n" /* B */ \ - "sri v0.8h, v18.8h, #1 \n" /* AR */ \ - "sri v0.8h, v17.8h, #6 \n" /* ARG */ \ - "sri v0.8h, v16.8h, #11 \n" /* ARGB */ - -void I422ToARGB1555Row_NEON(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* dst_argb1555, - const struct YuvConstants* yuvconstants, - int width) { - asm volatile( - YUVTORGB_SETUP - "movi v19.8b, #255 \n" - "1: \n" READYUV422 YUVTORGB - RGBTORGB8 - "subs %w[width], %w[width], #8 \n" ARGBTOARGB1555 - "st1 {v0.8h}, [%[dst_argb1555]], #16 \n" // store 8 pixels - // RGB565. - "b.gt 1b \n" - : [src_y] "+r"(src_y), // %[src_y] - [src_u] "+r"(src_u), // %[src_u] - [src_v] "+r"(src_v), // %[src_v] - [dst_argb1555] "+r"(dst_argb1555), // %[dst_argb1555] - [width] "+r"(width) // %[width] - : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff] - [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias) // %[kRGBCoeffBias] - : "cc", "memory", YUVTORGB_REGS, "v19"); -} - -#define ARGBTOARGB4444 \ - /* Input v16.8b<=B, v17.8b<=G, v18.8b<=R, v19.8b<=A, v23.8b<=0x0f */ \ - "ushr v16.8b, v16.8b, #4 \n" /* B */ \ - "bic v17.8b, v17.8b, v23.8b \n" /* G */ \ - "ushr v18.8b, v18.8b, #4 \n" /* R */ \ - "bic v19.8b, v19.8b, v23.8b \n" /* A */ \ - "orr v0.8b, v16.8b, v17.8b \n" /* BG */ \ - "orr v1.8b, v18.8b, v19.8b \n" /* RA */ \ - "zip1 v0.16b, v0.16b, v1.16b \n" /* BGRA */ - -void I422ToARGB4444Row_NEON(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* dst_argb4444, - const struct YuvConstants* yuvconstants, - int width) { - asm volatile( - YUVTORGB_SETUP - "movi v23.16b, #0x0f \n" // bits to clear with - // vbic. 
- "1: \n" READYUV422 YUVTORGB - RGBTORGB8 - "subs %w[width], %w[width], #8 \n" - "movi v19.8b, #255 \n" ARGBTOARGB4444 - "st1 {v0.8h}, [%[dst_argb4444]], #16 \n" // store 8 - // pixels - // ARGB4444. - "b.gt 1b \n" - : [src_y] "+r"(src_y), // %[src_y] - [src_u] "+r"(src_u), // %[src_u] - [src_v] "+r"(src_v), // %[src_v] - [dst_argb4444] "+r"(dst_argb4444), // %[dst_argb4444] - [width] "+r"(width) // %[width] - : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff] - [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias) // %[kRGBCoeffBias] - : "cc", "memory", YUVTORGB_REGS, "v19", "v23"); -} - -void I400ToARGBRow_NEON(const uint8_t* src_y, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { - asm volatile( - YUVTORGB_SETUP - "movi v19.8b, #255 \n" - "1: \n" READYUV400 YUVTORGB - RGBTORGB8 - "subs %w[width], %w[width], #8 \n" - "st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%[dst_argb]], #32 \n" - "b.gt 1b \n" - : [src_y] "+r"(src_y), // %[src_y] - [dst_argb] "+r"(dst_argb), // %[dst_argb] - [width] "+r"(width) // %[width] - : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff] - [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias) // %[kRGBCoeffBias] - : "cc", "memory", YUVTORGB_REGS, "v19"); -} - -void J400ToARGBRow_NEON(const uint8_t* src_y, uint8_t* dst_argb, int width) { - asm volatile( - "movi v23.8b, #255 \n" - "1: \n" - "ld1 {v20.8b}, [%0], #8 \n" - "prfm pldl1keep, [%0, 448] \n" - "orr v21.8b, v20.8b, v20.8b \n" - "orr v22.8b, v20.8b, v20.8b \n" - "subs %w2, %w2, #8 \n" - "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32 \n" - "b.gt 1b \n" - : "+r"(src_y), // %0 - "+r"(dst_argb), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "v20", "v21", "v22", "v23"); -} - -void NV12ToARGBRow_NEON(const uint8_t* src_y, - const uint8_t* src_uv, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { - asm volatile( - YUVTORGB_SETUP - "movi v19.8b, #255 \n" - "ldr q2, [%[kNV12Table]] \n" - "1: \n" READNV12 YUVTORGB RGBTORGB8 - "subs %w[width], %w[width], #8 \n" - "st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%[dst_argb]], #32 \n" - "b.gt 1b \n" - : [src_y] "+r"(src_y), // %[src_y] - [src_uv] "+r"(src_uv), // %[src_uv] - [dst_argb] "+r"(dst_argb), // %[dst_argb] - [width] "+r"(width) // %[width] - : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff] - [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias), // %[kRGBCoeffBias] - [kNV12Table] "r"(&kNV12Table) - : "cc", "memory", YUVTORGB_REGS, "v2", "v19"); -} - -void NV21ToARGBRow_NEON(const uint8_t* src_y, - const uint8_t* src_vu, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { - asm volatile( - YUVTORGB_SETUP - "movi v19.8b, #255 \n" - "ldr q2, [%[kNV12Table]] \n" - "1: \n" READNV12 YUVTORGB RGBTORGB8 - "subs %w[width], %w[width], #8 \n" - "st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%[dst_argb]], #32 \n" - "b.gt 1b \n" - : [src_y] "+r"(src_y), // %[src_y] - [src_uv] "+r"(src_vu), // %[src_uv] - [dst_argb] "+r"(dst_argb), // %[dst_argb] - [width] "+r"(width) // %[width] - : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff] - [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias), // %[kRGBCoeffBias] - [kNV12Table] "r"(&kNV21Table) - : "cc", "memory", YUVTORGB_REGS, "v2", "v19"); -} - -void NV12ToRGB24Row_NEON(const uint8_t* src_y, - const uint8_t* src_uv, - uint8_t* dst_rgb24, - const struct YuvConstants* yuvconstants, - int width) { - asm volatile( - YUVTORGB_SETUP - "ldr q2, [%[kNV12Table]] \n" - "1: \n" READNV12 YUVTORGB RGBTORGB8 - "subs %w[width], %w[width], #8 \n" - "st3 
{v16.8b,v17.8b,v18.8b}, [%[dst_rgb24]], #24 \n" - "b.gt 1b \n" - : [src_y] "+r"(src_y), // %[src_y] - [src_uv] "+r"(src_uv), // %[src_uv] - [dst_rgb24] "+r"(dst_rgb24), // %[dst_rgb24] - [width] "+r"(width) // %[width] - : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff] - [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias), // %[kRGBCoeffBias] - [kNV12Table] "r"(&kNV12Table) - : "cc", "memory", YUVTORGB_REGS, "v2"); -} - -void NV21ToRGB24Row_NEON(const uint8_t* src_y, - const uint8_t* src_vu, - uint8_t* dst_rgb24, - const struct YuvConstants* yuvconstants, - int width) { - asm volatile( - YUVTORGB_SETUP - "ldr q2, [%[kNV12Table]] \n" - "1: \n" READNV12 YUVTORGB RGBTORGB8 - "subs %w[width], %w[width], #8 \n" - "st3 {v16.8b,v17.8b,v18.8b}, [%[dst_rgb24]], #24 \n" - "b.gt 1b \n" - : [src_y] "+r"(src_y), // %[src_y] - [src_uv] "+r"(src_vu), // %[src_uv] - [dst_rgb24] "+r"(dst_rgb24), // %[dst_rgb24] - [width] "+r"(width) // %[width] - : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff] - [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias), // %[kRGBCoeffBias] - [kNV12Table] "r"(&kNV21Table) - : "cc", "memory", YUVTORGB_REGS, "v2"); -} - -void NV12ToRGB565Row_NEON(const uint8_t* src_y, - const uint8_t* src_uv, - uint8_t* dst_rgb565, - const struct YuvConstants* yuvconstants, - int width) { - asm volatile( - YUVTORGB_SETUP - "ldr q2, [%[kNV12Table]] \n" - "1: \n" READNV12 YUVTORGB RGBTORGB8 - "subs %w[width], %w[width], #8 \n" ARGBTORGB565 - "st1 {v18.8h}, [%[dst_rgb565]], #16 \n" // store 8 - // pixels - // RGB565. - "b.gt 1b \n" - : [src_y] "+r"(src_y), // %[src_y] - [src_uv] "+r"(src_uv), // %[src_uv] - [dst_rgb565] "+r"(dst_rgb565), // %[dst_rgb565] - [width] "+r"(width) // %[width] - : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff] - [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias), // %[kRGBCoeffBias] - [kNV12Table] "r"(&kNV12Table) - : "cc", "memory", YUVTORGB_REGS, "v2"); -} - -void YUY2ToARGBRow_NEON(const uint8_t* src_yuy2, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { - asm volatile( - YUVTORGB_SETUP - "movi v19.8b, #255 \n" - "ldr q2, [%[kNV12Table]] \n" - "1: \n" READYUY2 YUVTORGB RGBTORGB8 - "subs %w[width], %w[width], #8 \n" - "st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%[dst_argb]], #32 \n" - "b.gt 1b \n" - : [src_yuy2] "+r"(src_yuy2), // %[src_yuy2] - [dst_argb] "+r"(dst_argb), // %[dst_argb] - [width] "+r"(width) // %[width] - : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff] - [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias), // %[kRGBCoeffBias] - [kNV12Table] "r"(&kNV12Table) - : "cc", "memory", YUVTORGB_REGS, "v2", "v19"); -} - -void UYVYToARGBRow_NEON(const uint8_t* src_uyvy, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { - asm volatile( - YUVTORGB_SETUP - "movi v19.8b, #255 \n" - "ldr q2, [%[kNV12Table]] \n" - "1: \n" READUYVY YUVTORGB RGBTORGB8 - "subs %w[width], %w[width], #8 \n" - "st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%[dst_argb]], #32 \n" - "b.gt 1b \n" - : [src_uyvy] "+r"(src_uyvy), // %[src_yuy2] - [dst_argb] "+r"(dst_argb), // %[dst_argb] - [width] "+r"(width) // %[width] - : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff] - [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias), // %[kRGBCoeffBias] - [kNV12Table] "r"(&kNV12Table) - : "cc", "memory", YUVTORGB_REGS, "v2", "v19"); -} - -// Reads 16 pairs of UV and write even values to dst_u and odd to dst_v. 
-void SplitUVRow_NEON(const uint8_t* src_uv, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - asm volatile( - "1: \n" - "ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 pairs of UV - "subs %w3, %w3, #16 \n" // 16 processed per loop - "prfm pldl1keep, [%0, 448] \n" - "st1 {v0.16b}, [%1], #16 \n" // store U - "st1 {v1.16b}, [%2], #16 \n" // store V - "b.gt 1b \n" - : "+r"(src_uv), // %0 - "+r"(dst_u), // %1 - "+r"(dst_v), // %2 - "+r"(width) // %3 // Output registers - : // Input registers - : "cc", "memory", "v0", "v1" // Clobber List - ); -} - -// Reads 16 U's and V's and writes out 16 pairs of UV. -void MergeUVRow_NEON(const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* dst_uv, - int width) { - asm volatile( - "1: \n" - "ld1 {v0.16b}, [%0], #16 \n" // load U - "ld1 {v1.16b}, [%1], #16 \n" // load V - "subs %w3, %w3, #16 \n" // 16 processed per loop - "prfm pldl1keep, [%0, 448] \n" - "prfm pldl1keep, [%1, 448] \n" - "st2 {v0.16b,v1.16b}, [%2], #32 \n" // store 16 pairs of UV - "b.gt 1b \n" - : "+r"(src_u), // %0 - "+r"(src_v), // %1 - "+r"(dst_uv), // %2 - "+r"(width) // %3 // Output registers - : // Input registers - : "cc", "memory", "v0", "v1" // Clobber List - ); -} - -// Reads 16 packed RGB and write to planar dst_r, dst_g, dst_b. -void SplitRGBRow_NEON(const uint8_t* src_rgb, - uint8_t* dst_r, - uint8_t* dst_g, - uint8_t* dst_b, - int width) { - asm volatile( - "1: \n" - "ld3 {v0.16b,v1.16b,v2.16b}, [%0], #48 \n" // load 16 RGB - "subs %w4, %w4, #16 \n" // 16 processed per loop - "prfm pldl1keep, [%0, 448] \n" - "st1 {v0.16b}, [%1], #16 \n" // store R - "st1 {v1.16b}, [%2], #16 \n" // store G - "st1 {v2.16b}, [%3], #16 \n" // store B - "b.gt 1b \n" - : "+r"(src_rgb), // %0 - "+r"(dst_r), // %1 - "+r"(dst_g), // %2 - "+r"(dst_b), // %3 - "+r"(width) // %4 - : // Input registers - : "cc", "memory", "v0", "v1", "v2" // Clobber List - ); -} - -// Reads 16 planar R's, G's and B's and writes out 16 packed RGB at a time -void MergeRGBRow_NEON(const uint8_t* src_r, - const uint8_t* src_g, - const uint8_t* src_b, - uint8_t* dst_rgb, - int width) { - asm volatile( - "1: \n" - "ld1 {v0.16b}, [%0], #16 \n" // load R - "ld1 {v1.16b}, [%1], #16 \n" // load G - "ld1 {v2.16b}, [%2], #16 \n" // load B - "subs %w4, %w4, #16 \n" // 16 processed per loop - "prfm pldl1keep, [%0, 448] \n" - "prfm pldl1keep, [%1, 448] \n" - "prfm pldl1keep, [%2, 448] \n" - "st3 {v0.16b,v1.16b,v2.16b}, [%3], #48 \n" // store 16 RGB - "b.gt 1b \n" - : "+r"(src_r), // %0 - "+r"(src_g), // %1 - "+r"(src_b), // %2 - "+r"(dst_rgb), // %3 - "+r"(width) // %4 - : // Input registers - : "cc", "memory", "v0", "v1", "v2" // Clobber List - ); -} - -// Reads 16 packed ARGB and write to planar dst_r, dst_g, dst_b, dst_a. 
-void SplitARGBRow_NEON(const uint8_t* src_rgba, - uint8_t* dst_r, - uint8_t* dst_g, - uint8_t* dst_b, - uint8_t* dst_a, - int width) { - asm volatile( - "1: \n" - "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 ARGB - "subs %w5, %w5, #16 \n" // 16 processed per loop - "prfm pldl1keep, [%0, 448] \n" - "st1 {v0.16b}, [%3], #16 \n" // store B - "st1 {v1.16b}, [%2], #16 \n" // store G - "st1 {v2.16b}, [%1], #16 \n" // store R - "st1 {v3.16b}, [%4], #16 \n" // store A - "b.gt 1b \n" - : "+r"(src_rgba), // %0 - "+r"(dst_r), // %1 - "+r"(dst_g), // %2 - "+r"(dst_b), // %3 - "+r"(dst_a), // %4 - "+r"(width) // %5 - : // Input registers - : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List - ); -} - -// Reads 16 planar R's, G's, B's and A's and writes out 16 packed ARGB at a time -void MergeARGBRow_NEON(const uint8_t* src_r, - const uint8_t* src_g, - const uint8_t* src_b, - const uint8_t* src_a, - uint8_t* dst_argb, - int width) { - asm volatile( - "1: \n" - "ld1 {v2.16b}, [%0], #16 \n" // load R - "ld1 {v1.16b}, [%1], #16 \n" // load G - "ld1 {v0.16b}, [%2], #16 \n" // load B - "ld1 {v3.16b}, [%3], #16 \n" // load A - "subs %w5, %w5, #16 \n" // 16 processed per loop - "prfm pldl1keep, [%0, 448] \n" - "prfm pldl1keep, [%1, 448] \n" - "prfm pldl1keep, [%2, 448] \n" - "prfm pldl1keep, [%3, 448] \n" - "st4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%4], #64 \n" // store 16ARGB - "b.gt 1b \n" - : "+r"(src_r), // %0 - "+r"(src_g), // %1 - "+r"(src_b), // %2 - "+r"(src_a), // %3 - "+r"(dst_argb), // %4 - "+r"(width) // %5 - : // Input registers - : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List - ); -} - -// Reads 16 packed ARGB and write to planar dst_r, dst_g, dst_b. -void SplitXRGBRow_NEON(const uint8_t* src_rgba, - uint8_t* dst_r, - uint8_t* dst_g, - uint8_t* dst_b, - int width) { - asm volatile( - "1: \n" - "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 ARGB - "subs %w4, %w4, #16 \n" // 16 processed per loop - "prfm pldl1keep, [%0, 448] \n" - "st1 {v0.16b}, [%3], #16 \n" // store B - "st1 {v1.16b}, [%2], #16 \n" // store G - "st1 {v2.16b}, [%1], #16 \n" // store R - "b.gt 1b \n" - : "+r"(src_rgba), // %0 - "+r"(dst_r), // %1 - "+r"(dst_g), // %2 - "+r"(dst_b), // %3 - "+r"(width) // %4 - : // Input registers - : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List - ); -} - -// Reads 16 planar R's, G's and B's and writes out 16 packed ARGB at a time -void MergeXRGBRow_NEON(const uint8_t* src_r, - const uint8_t* src_g, - const uint8_t* src_b, - uint8_t* dst_argb, - int width) { - asm volatile( - "movi v3.16b, #255 \n" // load A(255) - "1: \n" - "ld1 {v2.16b}, [%0], #16 \n" // load R - "ld1 {v1.16b}, [%1], #16 \n" // load G - "ld1 {v0.16b}, [%2], #16 \n" // load B - "subs %w4, %w4, #16 \n" // 16 processed per loop - "prfm pldl1keep, [%0, 448] \n" - "prfm pldl1keep, [%1, 448] \n" - "prfm pldl1keep, [%2, 448] \n" - "st4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%3], #64 \n" // store 16ARGB - "b.gt 1b \n" - : "+r"(src_r), // %0 - "+r"(src_g), // %1 - "+r"(src_b), // %2 - "+r"(dst_argb), // %3 - "+r"(width) // %4 - : // Input registers - : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List - ); -} - -void MergeXR30Row_NEON(const uint16_t* src_r, - const uint16_t* src_g, - const uint16_t* src_b, - uint8_t* dst_ar30, - int depth, - int width) { - int shift = 10 - depth; - asm volatile( - "movi v30.16b, #255 \n" - "ushr v30.4s, v30.4s, #22 \n" // 1023 - "dup v31.4s, %w5 \n" - "1: \n" - "ldr d2, [%2], #8 \n" // B - "ldr d1, [%1], #8 \n" // G - "ldr d0, [%0], #8 \n" // R - "ushll 
v2.4s, v2.4h, #0 \n" // B - "ushll v1.4s, v1.4h, #0 \n" // G - "ushll v0.4s, v0.4h, #0 \n" // R - "ushl v2.4s, v2.4s, v31.4s \n" // 000B - "ushl v1.4s, v1.4s, v31.4s \n" // G - "ushl v0.4s, v0.4s, v31.4s \n" // R - "umin v2.4s, v2.4s, v30.4s \n" - "umin v1.4s, v1.4s, v30.4s \n" - "umin v0.4s, v0.4s, v30.4s \n" - "sli v2.4s, v1.4s, #10 \n" // 00GB - "sli v2.4s, v0.4s, #20 \n" // 0RGB - "orr v2.4s, #0xc0, lsl #24 \n" // ARGB (AR30) - "subs %w4, %w4, #4 \n" - "str q2, [%3], #16 \n" - "b.gt 1b \n" - : "+r"(src_r), // %0 - "+r"(src_g), // %1 - "+r"(src_b), // %2 - "+r"(dst_ar30), // %3 - "+r"(width) // %4 - : "r"(shift) // %5 - : "memory", "cc", "v0", "v1", "v2", "v30", "v31"); -} - -void MergeXR30Row_10_NEON(const uint16_t* src_r, - const uint16_t* src_g, - const uint16_t* src_b, - uint8_t* dst_ar30, - int /* depth */, - int width) { - asm volatile( - "movi v30.16b, #255 \n" - "ushr v30.4s, v30.4s, #22 \n" // 1023 - "1: \n" - "ldr d2, [%2], #8 \n" // B - "ldr d1, [%1], #8 \n" // G - "ldr d0, [%0], #8 \n" // R - "ushll v2.4s, v2.4h, #0 \n" // 000B - "ushll v1.4s, v1.4h, #0 \n" // G - "ushll v0.4s, v0.4h, #0 \n" // R - "umin v2.4s, v2.4s, v30.4s \n" - "umin v1.4s, v1.4s, v30.4s \n" - "umin v0.4s, v0.4s, v30.4s \n" - "sli v2.4s, v1.4s, #10 \n" // 00GB - "sli v2.4s, v0.4s, #20 \n" // 0RGB - "orr v2.4s, #0xc0, lsl #24 \n" // ARGB (AR30) - "subs %w4, %w4, #4 \n" - "str q2, [%3], #16 \n" - "b.gt 1b \n" - : "+r"(src_r), // %0 - "+r"(src_g), // %1 - "+r"(src_b), // %2 - "+r"(dst_ar30), // %3 - "+r"(width) // %4 - : - : "memory", "cc", "v0", "v1", "v2", "v30"); -} - -void MergeAR64Row_NEON(const uint16_t* src_r, - const uint16_t* src_g, - const uint16_t* src_b, - const uint16_t* src_a, - uint16_t* dst_ar64, - int depth, - int width) { - int shift = 16 - depth; - int mask = (1 << depth) - 1; - asm volatile( - - "dup v30.8h, %w7 \n" - "dup v31.8h, %w6 \n" - "1: \n" - "ldr q2, [%0], #16 \n" // R - "ldr q1, [%1], #16 \n" // G - "ldr q0, [%2], #16 \n" // B - "ldr q3, [%3], #16 \n" // A - "umin v2.8h, v2.8h, v30.8h \n" - "prfm pldl1keep, [%0, 448] \n" - "umin v1.8h, v1.8h, v30.8h \n" - "prfm pldl1keep, [%1, 448] \n" - "umin v0.8h, v0.8h, v30.8h \n" - "prfm pldl1keep, [%2, 448] \n" - "umin v3.8h, v3.8h, v30.8h \n" - "prfm pldl1keep, [%3, 448] \n" - "ushl v2.8h, v2.8h, v31.8h \n" - "ushl v1.8h, v1.8h, v31.8h \n" - "ushl v0.8h, v0.8h, v31.8h \n" - "ushl v3.8h, v3.8h, v31.8h \n" - "subs %w5, %w5, #8 \n" - "st4 {v0.8h, v1.8h, v2.8h, v3.8h}, [%4], #64 \n" - "b.gt 1b \n" - : "+r"(src_r), // %0 - "+r"(src_g), // %1 - "+r"(src_b), // %2 - "+r"(src_a), // %3 - "+r"(dst_ar64), // %4 - "+r"(width) // %5 - : "r"(shift), // %6 - "r"(mask) // %7 - : "memory", "cc", "v0", "v1", "v2", "v3", "v31"); -} - -void MergeXR64Row_NEON(const uint16_t* src_r, - const uint16_t* src_g, - const uint16_t* src_b, - uint16_t* dst_ar64, - int depth, - int width) { - int shift = 16 - depth; - int mask = (1 << depth) - 1; - asm volatile( - - "movi v3.16b, #0xff \n" // A (0xffff) - "dup v30.8h, %w6 \n" - "dup v31.8h, %w5 \n" - - "1: \n" - "ldr q2, [%0], #16 \n" // R - "ldr q1, [%1], #16 \n" // G - "ldr q0, [%2], #16 \n" // B - "umin v2.8h, v2.8h, v30.8h \n" - "prfm pldl1keep, [%0, 448] \n" - "umin v1.8h, v1.8h, v30.8h \n" - "prfm pldl1keep, [%1, 448] \n" - "umin v0.8h, v0.8h, v30.8h \n" - "prfm pldl1keep, [%2, 448] \n" - "ushl v2.8h, v2.8h, v31.8h \n" - "ushl v1.8h, v1.8h, v31.8h \n" - "ushl v0.8h, v0.8h, v31.8h \n" - "subs %w4, %w4, #8 \n" - "st4 {v0.8h, v1.8h, v2.8h, v3.8h}, [%3], #64 \n" - "b.gt 1b \n" - : "+r"(src_r), // %0 - "+r"(src_g), // 
%1 - "+r"(src_b), // %2 - "+r"(dst_ar64), // %3 - "+r"(width) // %4 - : "r"(shift), // %5 - "r"(mask) // %6 - : "memory", "cc", "v0", "v1", "v2", "v3", "v31"); -} - -void MergeARGB16To8Row_NEON(const uint16_t* src_r, - const uint16_t* src_g, - const uint16_t* src_b, - const uint16_t* src_a, - uint8_t* dst_argb, - int depth, - int width) { - int shift = 8 - depth; - asm volatile( - - "dup v31.8h, %w6 \n" - "1: \n" - "ldr q2, [%0], #16 \n" // R - "ldr q1, [%1], #16 \n" // G - "ldr q0, [%2], #16 \n" // B - "ldr q3, [%3], #16 \n" // A - "ushl v2.8h, v2.8h, v31.8h \n" - "prfm pldl1keep, [%0, 448] \n" - "ushl v1.8h, v1.8h, v31.8h \n" - "prfm pldl1keep, [%1, 448] \n" - "ushl v0.8h, v0.8h, v31.8h \n" - "prfm pldl1keep, [%2, 448] \n" - "ushl v3.8h, v3.8h, v31.8h \n" - "prfm pldl1keep, [%3, 448] \n" - "uqxtn v2.8b, v2.8h \n" - "uqxtn v1.8b, v1.8h \n" - "uqxtn v0.8b, v0.8h \n" - "uqxtn v3.8b, v3.8h \n" - "subs %w5, %w5, #8 \n" - "st4 {v0.8b, v1.8b, v2.8b, v3.8b}, [%4], #32 \n" - "b.gt 1b \n" - : "+r"(src_r), // %0 - "+r"(src_g), // %1 - "+r"(src_b), // %2 - "+r"(src_a), // %3 - "+r"(dst_argb), // %4 - "+r"(width) // %5 - : "r"(shift) // %6 - : "memory", "cc", "v0", "v1", "v2", "v3", "v31"); -} - -void MergeXRGB16To8Row_NEON(const uint16_t* src_r, - const uint16_t* src_g, - const uint16_t* src_b, - uint8_t* dst_argb, - int depth, - int width) { - int shift = 8 - depth; - asm volatile( - - "dup v31.8h, %w5 \n" - "movi v3.8b, #0xff \n" // A (0xff) - "1: \n" - "ldr q2, [%0], #16 \n" // R - "ldr q1, [%1], #16 \n" // G - "ldr q0, [%2], #16 \n" // B - "ushl v2.8h, v2.8h, v31.8h \n" - "prfm pldl1keep, [%0, 448] \n" - "ushl v1.8h, v1.8h, v31.8h \n" - "prfm pldl1keep, [%1, 448] \n" - "ushl v0.8h, v0.8h, v31.8h \n" - "prfm pldl1keep, [%2, 448] \n" - "uqxtn v2.8b, v2.8h \n" - "uqxtn v1.8b, v1.8h \n" - "uqxtn v0.8b, v0.8h \n" - "subs %w4, %w4, #8 \n" - "st4 {v0.8b, v1.8b, v2.8b, v3.8b}, [%3], #32 \n" - "b.gt 1b \n" - : "+r"(src_r), // %0 - "+r"(src_g), // %1 - "+r"(src_b), // %2 - "+r"(dst_argb), // %3 - "+r"(width) // %4 - : "r"(shift) // %5 - : "memory", "cc", "v0", "v1", "v2", "v3", "v31"); -} - -// Copy multiple of 32. -void CopyRow_NEON(const uint8_t* src, uint8_t* dst, int width) { - asm volatile( - "1: \n" - "ldp q0, q1, [%0], #32 \n" - "prfm pldl1keep, [%0, 448] \n" - "subs %w2, %w2, #32 \n" // 32 processed per loop - "stp q0, q1, [%1], #32 \n" - "b.gt 1b \n" - : "+r"(src), // %0 - "+r"(dst), // %1 - "+r"(width) // %2 // Output registers - : // Input registers - : "cc", "memory", "v0", "v1" // Clobber List - ); -} - -// SetRow writes 'width' bytes using an 8 bit value repeated. -void SetRow_NEON(uint8_t* dst, uint8_t v8, int width) { - asm volatile( - "dup v0.16b, %w2 \n" // duplicate 16 bytes - "1: \n" - "subs %w1, %w1, #16 \n" // 16 bytes per loop - "st1 {v0.16b}, [%0], #16 \n" // store - "b.gt 1b \n" - : "+r"(dst), // %0 - "+r"(width) // %1 - : "r"(v8) // %2 - : "cc", "memory", "v0"); -} - -void ARGBSetRow_NEON(uint8_t* dst, uint32_t v32, int width) { - asm volatile( - "dup v0.4s, %w2 \n" // duplicate 4 ints - "1: \n" - "subs %w1, %w1, #4 \n" // 4 ints per loop - "st1 {v0.16b}, [%0], #16 \n" // store - "b.gt 1b \n" - : "+r"(dst), // %0 - "+r"(width) // %1 - : "r"(v32) // %2 - : "cc", "memory", "v0"); -} - -// Shuffle table for reversing the bytes. -static const uvec8 kShuffleMirror = {15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u, - 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u}; - -void MirrorRow_NEON(const uint8_t* src, uint8_t* dst, int width) { - asm volatile( - // Start at end of source row. 
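      // kShuffleMirror holds the byte indices 15..0, so each 'tbl' below
      // reverses the byte order within a 16-byte block; combined with loads
      // that walk backwards from the end of the row, every iteration emits
      // 32 mirrored pixels.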
- "ld1 {v3.16b}, [%3] \n" // shuffler - "add %0, %0, %w2, sxtw \n" - "sub %0, %0, #32 \n" - "1: \n" - "ldr q2, [%0, 16] \n" - "ldr q1, [%0], -32 \n" // src -= 32 - "subs %w2, %w2, #32 \n" // 32 pixels per loop. - "tbl v0.16b, {v2.16b}, v3.16b \n" - "tbl v1.16b, {v1.16b}, v3.16b \n" - "st1 {v0.16b, v1.16b}, [%1], #32 \n" // store 32 pixels - "b.gt 1b \n" - : "+r"(src), // %0 - "+r"(dst), // %1 - "+r"(width) // %2 - : "r"(&kShuffleMirror) // %3 - : "cc", "memory", "v0", "v1", "v2", "v3"); -} - -// Shuffle table for reversing the UV. -static const uvec8 kShuffleMirrorUV = {14u, 15u, 12u, 13u, 10u, 11u, 8u, 9u, - 6u, 7u, 4u, 5u, 2u, 3u, 0u, 1u}; - -void MirrorUVRow_NEON(const uint8_t* src_uv, uint8_t* dst_uv, int width) { - asm volatile( - // Start at end of source row. - "ld1 {v4.16b}, [%3] \n" // shuffler - "add %0, %0, %w2, sxtw #1 \n" - "sub %0, %0, #32 \n" - "1: \n" - "ldr q1, [%0, 16] \n" - "ldr q0, [%0], -32 \n" // src -= 32 - "subs %w2, %w2, #16 \n" // 16 pixels per loop. - "tbl v2.16b, {v1.16b}, v4.16b \n" - "tbl v3.16b, {v0.16b}, v4.16b \n" - "st1 {v2.16b, v3.16b}, [%1], #32 \n" // dst += 32 - "b.gt 1b \n" - : "+r"(src_uv), // %0 - "+r"(dst_uv), // %1 - "+r"(width) // %2 - : "r"(&kShuffleMirrorUV) // %3 - : "cc", "memory", "v0", "v1", "v2", "v3", "v4"); -} - -void MirrorSplitUVRow_NEON(const uint8_t* src_uv, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - asm volatile( - // Start at end of source row. - "ld1 {v4.16b}, [%4] \n" // shuffler - "add %0, %0, %w3, sxtw #1 \n" - "sub %0, %0, #32 \n" - "1: \n" - "ldr q1, [%0, 16] \n" - "ldr q0, [%0], -32 \n" // src -= 32 - "subs %w3, %w3, #16 \n" // 16 pixels per loop. - "tbl v2.16b, {v1.16b}, v4.16b \n" - "tbl v3.16b, {v0.16b}, v4.16b \n" - "uzp1 v0.16b, v2.16b, v3.16b \n" // U - "uzp2 v1.16b, v2.16b, v3.16b \n" // V - "st1 {v0.16b}, [%1], #16 \n" // dst += 16 - "st1 {v1.16b}, [%2], #16 \n" - "b.gt 1b \n" - : "+r"(src_uv), // %0 - "+r"(dst_u), // %1 - "+r"(dst_v), // %2 - "+r"(width) // %3 - : "r"(&kShuffleMirrorUV) // %4 - : "cc", "memory", "v0", "v1", "v2", "v3", "v4"); -} - -// Shuffle table for reversing the ARGB. -static const uvec8 kShuffleMirrorARGB = {12u, 13u, 14u, 15u, 8u, 9u, 10u, 11u, - 4u, 5u, 6u, 7u, 0u, 1u, 2u, 3u}; - -void ARGBMirrorRow_NEON(const uint8_t* src_argb, uint8_t* dst_argb, int width) { - asm volatile( - // Start at end of source row. - "ld1 {v4.16b}, [%3] \n" // shuffler - "add %0, %0, %w2, sxtw #2 \n" - "sub %0, %0, #32 \n" - "1: \n" - "ldr q1, [%0, 16] \n" - "ldr q0, [%0], -32 \n" // src -= 32 - "subs %w2, %w2, #8 \n" // 8 pixels per loop. - "tbl v2.16b, {v1.16b}, v4.16b \n" - "tbl v3.16b, {v0.16b}, v4.16b \n" - "st1 {v2.16b, v3.16b}, [%1], #32 \n" // dst += 32 - "b.gt 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_argb), // %1 - "+r"(width) // %2 - : "r"(&kShuffleMirrorARGB) // %3 - : "cc", "memory", "v0", "v1", "v2", "v3", "v4"); -} - -void RGB24MirrorRow_NEON(const uint8_t* src_rgb24, - uint8_t* dst_rgb24, - int width) { - asm volatile( - "ld1 {v3.16b}, [%4] \n" // shuffler - "add %0, %0, %w2, sxtw #1 \n" // Start at end of row. - "add %0, %0, %w2, sxtw \n" - "sub %0, %0, #48 \n" - - "1: \n" - "ld3 {v0.16b, v1.16b, v2.16b}, [%0], %3 \n" // src -= 48 - "subs %w2, %w2, #16 \n" // 16 pixels per loop. 
- "tbl v0.16b, {v0.16b}, v3.16b \n" - "tbl v1.16b, {v1.16b}, v3.16b \n" - "tbl v2.16b, {v2.16b}, v3.16b \n" - "st3 {v0.16b, v1.16b, v2.16b}, [%1], #48 \n" // dst += 48 - "b.gt 1b \n" - : "+r"(src_rgb24), // %0 - "+r"(dst_rgb24), // %1 - "+r"(width) // %2 - : "r"((ptrdiff_t)-48), // %3 - "r"(&kShuffleMirror) // %4 - : "cc", "memory", "v0", "v1", "v2", "v3"); -} - -void RGB24ToARGBRow_NEON(const uint8_t* src_rgb24, - uint8_t* dst_argb, - int width) { - asm volatile( - "movi v4.8b, #255 \n" // Alpha - "1: \n" - "ld3 {v1.8b,v2.8b,v3.8b}, [%0], #24 \n" // load 8 pixels of - // RGB24. - "prfm pldl1keep, [%0, 448] \n" - "subs %w2, %w2, #8 \n" // 8 processed per loop. - "st4 {v1.8b,v2.8b,v3.8b,v4.8b}, [%1], #32 \n" // store 8 ARGB - "b.gt 1b \n" - : "+r"(src_rgb24), // %0 - "+r"(dst_argb), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "v1", "v2", "v3", "v4" // Clobber List - ); -} - -void RAWToARGBRow_NEON(const uint8_t* src_raw, uint8_t* dst_argb, int width) { - asm volatile( - "movi v5.8b, #255 \n" // Alpha - "1: \n" - "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // read r g b - "subs %w2, %w2, #8 \n" // 8 processed per loop. - "orr v3.8b, v1.8b, v1.8b \n" // move g - "prfm pldl1keep, [%0, 448] \n" - "orr v4.8b, v0.8b, v0.8b \n" // move r - "st4 {v2.8b,v3.8b,v4.8b,v5.8b}, [%1], #32 \n" // store b g r a - "b.gt 1b \n" - : "+r"(src_raw), // %0 - "+r"(dst_argb), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5" // Clobber List - ); -} - -void RAWToRGBARow_NEON(const uint8_t* src_raw, uint8_t* dst_rgba, int width) { - asm volatile( - "movi v0.8b, #255 \n" // Alpha - "1: \n" - "ld3 {v3.8b,v4.8b,v5.8b}, [%0], #24 \n" // read r g b - "subs %w2, %w2, #8 \n" // 8 processed per loop. - "orr v2.8b, v4.8b, v4.8b \n" // move g - "prfm pldl1keep, [%0, 448] \n" - "orr v1.8b, v5.8b, v5.8b \n" // move r - "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store a b g r - "b.gt 1b \n" - : "+r"(src_raw), // %0 - "+r"(dst_rgba), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5" // Clobber List - ); -} - -void RAWToRGB24Row_NEON(const uint8_t* src_raw, uint8_t* dst_rgb24, int width) { - asm volatile( - "1: \n" - "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // read r g b - "subs %w2, %w2, #8 \n" // 8 processed per loop. - "orr v3.8b, v1.8b, v1.8b \n" // move g - "prfm pldl1keep, [%0, 448] \n" - "orr v4.8b, v0.8b, v0.8b \n" // move r - "st3 {v2.8b,v3.8b,v4.8b}, [%1], #24 \n" // store b g r - "b.gt 1b \n" - : "+r"(src_raw), // %0 - "+r"(dst_rgb24), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "v0", "v1", "v2", "v3", "v4" // Clobber List - ); -} - -#define RGB565TOARGB \ - "shrn v6.8b, v0.8h, #5 \n" /* G xxGGGGGG */ \ - "shl v6.8b, v6.8b, #2 \n" /* G GGGGGG00 upper 6 */ \ - "ushr v4.8b, v6.8b, #6 \n" /* G 000000GG lower 2 */ \ - "orr v1.8b, v4.8b, v6.8b \n" /* G */ \ - "xtn v2.8b, v0.8h \n" /* B xxxBBBBB */ \ - "ushr v0.8h, v0.8h, #11 \n" /* R 000RRRRR */ \ - "xtn2 v2.16b,v0.8h \n" /* R in upper part */ \ - "shl v2.16b, v2.16b, #3 \n" /* R,B BBBBB000 upper 5 */ \ - "ushr v0.16b, v2.16b, #5 \n" /* R,B 00000BBB lower 3 */ \ - "orr v0.16b, v0.16b, v2.16b \n" /* R,B */ \ - "dup v2.2D, v0.D[1] \n" /* R */ - -void RGB565ToARGBRow_NEON(const uint8_t* src_rgb565, - uint8_t* dst_argb, - int width) { - asm volatile( - "movi v3.8b, #255 \n" // Alpha - "1: \n" - "ld1 {v0.16b}, [%0], #16 \n" // load 8 RGB565 pixels. - "subs %w2, %w2, #8 \n" // 8 processed per loop. 
- "prfm pldl1keep, [%0, 448] \n" RGB565TOARGB - "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB - "b.gt 1b \n" - : "+r"(src_rgb565), // %0 - "+r"(dst_argb), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v6" // Clobber List - ); -} - -#define ARGB1555TOARGB \ - "ushr v2.8h, v0.8h, #10 \n" /* R xxxRRRRR */ \ - "shl v2.8h, v2.8h, #3 \n" /* R RRRRR000 upper 5 */ \ - "xtn v3.8b, v2.8h \n" /* RRRRR000 AAAAAAAA */ \ - \ - "sshr v2.8h, v0.8h, #15 \n" /* A AAAAAAAA */ \ - "xtn2 v3.16b, v2.8h \n" \ - \ - "xtn v2.8b, v0.8h \n" /* B xxxBBBBB */ \ - "shrn2 v2.16b,v0.8h, #5 \n" /* G xxxGGGGG */ \ - \ - "ushr v1.16b, v3.16b, #5 \n" /* R,A 00000RRR lower 3 */ \ - "shl v0.16b, v2.16b, #3 \n" /* B,G BBBBB000 upper 5 */ \ - "ushr v2.16b, v0.16b, #5 \n" /* B,G 00000BBB lower 3 */ \ - \ - "orr v0.16b, v0.16b, v2.16b \n" /* B,G */ \ - "orr v2.16b, v1.16b, v3.16b \n" /* R,A */ \ - "dup v1.2D, v0.D[1] \n" \ - "dup v3.2D, v2.D[1] \n" - -// RGB555TOARGB is same as ARGB1555TOARGB but ignores alpha. -#define RGB555TOARGB \ - "ushr v2.8h, v0.8h, #10 \n" /* R xxxRRRRR */ \ - "shl v2.8h, v2.8h, #3 \n" /* R RRRRR000 upper 5 */ \ - "xtn v3.8b, v2.8h \n" /* RRRRR000 */ \ - \ - "xtn v2.8b, v0.8h \n" /* B xxxBBBBB */ \ - "shrn2 v2.16b,v0.8h, #5 \n" /* G xxxGGGGG */ \ - \ - "ushr v1.16b, v3.16b, #5 \n" /* R 00000RRR lower 3 */ \ - "shl v0.16b, v2.16b, #3 \n" /* B,G BBBBB000 upper 5 */ \ - "ushr v2.16b, v0.16b, #5 \n" /* B,G 00000BBB lower 3 */ \ - \ - "orr v0.16b, v0.16b, v2.16b \n" /* B,G */ \ - "orr v2.16b, v1.16b, v3.16b \n" /* R */ \ - "dup v1.2D, v0.D[1] \n" /* G */ - -void ARGB1555ToARGBRow_NEON(const uint8_t* src_argb1555, - uint8_t* dst_argb, - int width) { - asm volatile( - "movi v3.8b, #255 \n" // Alpha - "1: \n" - "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB1555 pixels. - "prfm pldl1keep, [%0, 448] \n" - "subs %w2, %w2, #8 \n" // 8 processed per loop. - ARGB1555TOARGB - "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB - "b.gt 1b \n" - : "+r"(src_argb1555), // %0 - "+r"(dst_argb), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List - ); -} - -// Convert v0.8h to b = v0.8b g = v1.8b r = v2.8b -// clobbers v3 -#define ARGB4444TOARGB \ - "shrn v1.8b, v0.8h, #8 \n" /* v1(l) AR */ \ - "xtn2 v1.16b, v0.8h \n" /* v1(h) GB */ \ - "shl v2.16b, v1.16b, #4 \n" /* B,R BBBB0000 */ \ - "ushr v3.16b, v1.16b, #4 \n" /* G,A 0000GGGG */ \ - "ushr v0.16b, v2.16b, #4 \n" /* B,R 0000BBBB */ \ - "shl v1.16b, v3.16b, #4 \n" /* G,A GGGG0000 */ \ - "orr v2.16b, v0.16b, v2.16b \n" /* B,R BBBBBBBB */ \ - "orr v3.16b, v1.16b, v3.16b \n" /* G,A GGGGGGGG */ \ - "dup v0.2D, v2.D[1] \n" \ - "dup v1.2D, v3.D[1] \n" - -void ARGB4444ToARGBRow_NEON(const uint8_t* src_argb4444, - uint8_t* dst_argb, - int width) { - asm volatile( - "1: \n" - "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB4444 pixels. - "subs %w2, %w2, #8 \n" // 8 processed per loop. - "prfm pldl1keep, [%0, 448] \n" ARGB4444TOARGB - "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB - "b.gt 1b \n" - : "+r"(src_argb4444), // %0 - "+r"(dst_argb), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "v0", "v1", "v2", "v3", "v4" // Clobber List - ); -} - -void ARGBToRGB24Row_NEON(const uint8_t* src_argb, - uint8_t* dst_rgb24, - int width) { - asm volatile( - "1: \n" - "ld4 {v1.8b,v2.8b,v3.8b,v4.8b}, [%0], #32 \n" // load 8 ARGB - "subs %w2, %w2, #8 \n" // 8 processed per loop. 
- "prfm pldl1keep, [%0, 448] \n" - "st3 {v1.8b,v2.8b,v3.8b}, [%1], #24 \n" // store 8 pixels of - // RGB24 - "b.gt 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_rgb24), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "v1", "v2", "v3", "v4" // Clobber List - ); -} - -void ARGBToRAWRow_NEON(const uint8_t* src_argb, uint8_t* dst_raw, int width) { - asm volatile( - "1: \n" - "ld4 {v1.8b,v2.8b,v3.8b,v4.8b}, [%0], #32 \n" // load b g r a - "subs %w2, %w2, #8 \n" // 8 processed per loop. - "orr v4.8b, v2.8b, v2.8b \n" // mov g - "prfm pldl1keep, [%0, 448] \n" - "orr v5.8b, v1.8b, v1.8b \n" // mov b - "st3 {v3.8b,v4.8b,v5.8b}, [%1], #24 \n" // store r g b - "b.gt 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_raw), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "v1", "v2", "v3", "v4", "v5" // Clobber List - ); -} - -void YUY2ToYRow_NEON(const uint8_t* src_yuy2, uint8_t* dst_y, int width) { - asm volatile( - "1: \n" - "ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 pixels of YUY2. - "subs %w2, %w2, #16 \n" // 16 processed per loop. - "prfm pldl1keep, [%0, 448] \n" - "st1 {v0.16b}, [%1], #16 \n" // store 16 pixels of Y. - "b.gt 1b \n" - : "+r"(src_yuy2), // %0 - "+r"(dst_y), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "v0", "v1" // Clobber List - ); -} - -void UYVYToYRow_NEON(const uint8_t* src_uyvy, uint8_t* dst_y, int width) { - asm volatile( - "1: \n" - "ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 pixels of UYVY. - "subs %w2, %w2, #16 \n" // 16 processed per loop. - "prfm pldl1keep, [%0, 448] \n" - "st1 {v1.16b}, [%1], #16 \n" // store 16 pixels of Y. - "b.gt 1b \n" - : "+r"(src_uyvy), // %0 - "+r"(dst_y), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "v0", "v1" // Clobber List - ); -} - -void YUY2ToUV422Row_NEON(const uint8_t* src_yuy2, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - asm volatile( - "1: \n" - "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 YUY2 - "subs %w3, %w3, #16 \n" // 16 pixels = 8 UVs. - "prfm pldl1keep, [%0, 448] \n" - "st1 {v1.8b}, [%1], #8 \n" // store 8 U. - "st1 {v3.8b}, [%2], #8 \n" // store 8 V. - "b.gt 1b \n" - : "+r"(src_yuy2), // %0 - "+r"(dst_u), // %1 - "+r"(dst_v), // %2 - "+r"(width) // %3 - : - : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List - ); -} - -void UYVYToUV422Row_NEON(const uint8_t* src_uyvy, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - asm volatile( - "1: \n" - "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 UYVY - "subs %w3, %w3, #16 \n" // 16 pixels = 8 UVs. - "prfm pldl1keep, [%0, 448] \n" - "st1 {v0.8b}, [%1], #8 \n" // store 8 U. - "st1 {v2.8b}, [%2], #8 \n" // store 8 V. - "b.gt 1b \n" - : "+r"(src_uyvy), // %0 - "+r"(dst_u), // %1 - "+r"(dst_v), // %2 - "+r"(width) // %3 - : - : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List - ); -} - -void YUY2ToUVRow_NEON(const uint8_t* src_yuy2, - int stride_yuy2, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - const uint8_t* src_yuy2b = src_yuy2 + stride_yuy2; - asm volatile( - "1: \n" - "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 pixels - "subs %w4, %w4, #16 \n" // 16 pixels = 8 UVs. - "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load next row - "urhadd v1.8b, v1.8b, v5.8b \n" // average rows of U - "prfm pldl1keep, [%0, 448] \n" - "urhadd v3.8b, v3.8b, v7.8b \n" // average rows of V - "st1 {v1.8b}, [%2], #8 \n" // store 8 U. - "st1 {v3.8b}, [%3], #8 \n" // store 8 V. 
- "b.gt 1b \n" - : "+r"(src_yuy2), // %0 - "+r"(src_yuy2b), // %1 - "+r"(dst_u), // %2 - "+r"(dst_v), // %3 - "+r"(width) // %4 - : - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", - "v7" // Clobber List - ); -} - -void UYVYToUVRow_NEON(const uint8_t* src_uyvy, - int stride_uyvy, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - const uint8_t* src_uyvyb = src_uyvy + stride_uyvy; - asm volatile( - "1: \n" - "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 pixels - "subs %w4, %w4, #16 \n" // 16 pixels = 8 UVs. - "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load next row - "urhadd v0.8b, v0.8b, v4.8b \n" // average rows of U - "prfm pldl1keep, [%0, 448] \n" - "urhadd v2.8b, v2.8b, v6.8b \n" // average rows of V - "st1 {v0.8b}, [%2], #8 \n" // store 8 U. - "st1 {v2.8b}, [%3], #8 \n" // store 8 V. - "b.gt 1b \n" - : "+r"(src_uyvy), // %0 - "+r"(src_uyvyb), // %1 - "+r"(dst_u), // %2 - "+r"(dst_v), // %3 - "+r"(width) // %4 - : - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", - "v7" // Clobber List - ); -} - -// For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA. -void ARGBShuffleRow_NEON(const uint8_t* src_argb, - uint8_t* dst_argb, - const uint8_t* shuffler, - int width) { - asm volatile( - "ld1 {v2.16b}, [%3] \n" // shuffler - "1: \n" - "ld1 {v0.16b}, [%0], #16 \n" // load 4 pixels. - "subs %w2, %w2, #4 \n" // 4 processed per loop - "prfm pldl1keep, [%0, 448] \n" - "tbl v1.16b, {v0.16b}, v2.16b \n" // look up 4 pixels - "st1 {v1.16b}, [%1], #16 \n" // store 4. - "b.gt 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_argb), // %1 - "+r"(width) // %2 - : "r"(shuffler) // %3 - : "cc", "memory", "v0", "v1", "v2" // Clobber List - ); -} - -void I422ToYUY2Row_NEON(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* dst_yuy2, - int width) { - asm volatile( - "1: \n" - "ld2 {v0.8b, v1.8b}, [%0], #16 \n" // load 16 Ys - "subs %w4, %w4, #16 \n" // 16 pixels - "orr v2.8b, v1.8b, v1.8b \n" - "prfm pldl1keep, [%0, 448] \n" - "ld1 {v1.8b}, [%1], #8 \n" // load 8 Us - "ld1 {v3.8b}, [%2], #8 \n" // load 8 Vs - "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%3], #32 \n" // Store 16 pixels. - "b.gt 1b \n" - : "+r"(src_y), // %0 - "+r"(src_u), // %1 - "+r"(src_v), // %2 - "+r"(dst_yuy2), // %3 - "+r"(width) // %4 - : - : "cc", "memory", "v0", "v1", "v2", "v3"); -} - -void I422ToUYVYRow_NEON(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* dst_uyvy, - int width) { - asm volatile( - "1: \n" - "ld2 {v1.8b,v2.8b}, [%0], #16 \n" // load 16 Ys - "orr v3.8b, v2.8b, v2.8b \n" - "prfm pldl1keep, [%0, 448] \n" - "ld1 {v0.8b}, [%1], #8 \n" // load 8 Us - "ld1 {v2.8b}, [%2], #8 \n" // load 8 Vs - "subs %w4, %w4, #16 \n" // 16 pixels - "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%3], #32 \n" // Store 16 pixels. - "b.gt 1b \n" - : "+r"(src_y), // %0 - "+r"(src_u), // %1 - "+r"(src_v), // %2 - "+r"(dst_uyvy), // %3 - "+r"(width) // %4 - : - : "cc", "memory", "v0", "v1", "v2", "v3"); -} - -void ARGBToRGB565Row_NEON(const uint8_t* src_argb, - uint8_t* dst_rgb565, - int width) { - asm volatile( - "1: \n" - "ld4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%0], #32 \n" // load 8 - // pixels - "subs %w2, %w2, #8 \n" // 8 processed per loop. - "prfm pldl1keep, [%0, 448] \n" ARGBTORGB565 - "st1 {v18.16b}, [%1], #16 \n" // store 8 pixels RGB565. 
- "b.gt 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_rgb565), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "v16", "v17", "v18", "v19"); -} - -void ARGBToRGB565DitherRow_NEON(const uint8_t* src_argb, - uint8_t* dst_rgb, - const uint32_t dither4, - int width) { - asm volatile( - "dup v1.4s, %w2 \n" // dither4 - "1: \n" - "ld4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%1], #32 \n" // load 8 - // pixels - "subs %w3, %w3, #8 \n" // 8 processed per loop. - "uqadd v16.8b, v16.8b, v1.8b \n" - "prfm pldl1keep, [%0, 448] \n" - "uqadd v17.8b, v17.8b, v1.8b \n" - "uqadd v18.8b, v18.8b, v1.8b \n" ARGBTORGB565 - "st1 {v18.16b}, [%0], #16 \n" // store 8 pixels RGB565. - "b.gt 1b \n" - : "+r"(dst_rgb) // %0 - : "r"(src_argb), // %1 - "r"(dither4), // %2 - "r"(width) // %3 - : "cc", "memory", "v1", "v16", "v17", "v18", "v19"); -} - -void ARGBToARGB1555Row_NEON(const uint8_t* src_argb, - uint8_t* dst_argb1555, - int width) { - asm volatile( - "1: \n" - "ld4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%0], #32 \n" // load 8 - // pixels - "subs %w2, %w2, #8 \n" // 8 processed per loop. - "prfm pldl1keep, [%0, 448] \n" ARGBTOARGB1555 - "st1 {v0.16b}, [%1], #16 \n" // store 8 pixels - "b.gt 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_argb1555), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "v0", "v16", "v17", "v18", "v19"); -} - -void ARGBToARGB4444Row_NEON(const uint8_t* src_argb, - uint8_t* dst_argb4444, - int width) { - asm volatile( - "movi v23.16b, #0x0f \n" // bits to clear with - // vbic. - "1: \n" - "ld4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%0], #32 \n" // load 8 - // pixels - "subs %w2, %w2, #8 \n" // 8 processed per loop. - "prfm pldl1keep, [%0, 448] \n" ARGBTOARGB4444 - "st1 {v0.16b}, [%1], #16 \n" // store 8 pixels - "b.gt 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_argb4444), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "v0", "v1", "v16", "v17", "v18", "v19", "v23"); -} - -static const uvec8 kShuffleARGBToABGR = {2, 1, 0, 3, 6, 5, 4, 7, - 10, 9, 8, 11, 14, 13, 12, 15}; - -void ARGBToAR64Row_NEON(const uint8_t* src_argb, - uint16_t* dst_ar64, - int width) { - asm volatile( - "1: \n" - "ldp q0, q2, [%0], #32 \n" // load 8 pixels - "mov v1.16b, v0.16b \n" - "prfm pldl1keep, [%0, 448] \n" - "mov v3.16b, v2.16b \n" - "subs %w2, %w2, #8 \n" // 8 processed per loop. - "st2 {v0.16b, v1.16b}, [%1], #32 \n" // store 4 pixels - "st2 {v2.16b, v3.16b}, [%1], #32 \n" // store 4 pixels - "b.gt 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_ar64), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "v0", "v1", "v2", "v3"); -} - -void ARGBToAB64Row_NEON(const uint8_t* src_argb, - uint16_t* dst_ab64, - int width) { - asm volatile( - "ld1 {v4.16b}, %3 \n" // shuffler - "1: \n" - "ldp q0, q2, [%0], #32 \n" // load 8 pixels - "tbl v0.16b, {v0.16b}, v4.16b \n" - "tbl v2.16b, {v2.16b}, v4.16b \n" - "prfm pldl1keep, [%0, 448] \n" - "mov v1.16b, v0.16b \n" - "mov v3.16b, v2.16b \n" - "subs %w2, %w2, #8 \n" // 8 processed per loop. 
- "st2 {v0.16b, v1.16b}, [%1], #32 \n" // store 4 pixels - "st2 {v2.16b, v3.16b}, [%1], #32 \n" // store 4 pixels - "b.gt 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_ab64), // %1 - "+r"(width) // %2 - : "m"(kShuffleARGBToABGR) // %3 - : "cc", "memory", "v0", "v1", "v2", "v3", "v4"); -} - -static const uvec8 kShuffleAR64ToARGB = {1, 3, 5, 7, 9, 11, 13, 15, - 17, 19, 21, 23, 25, 27, 29, 31}; - -void AR64ToARGBRow_NEON(const uint16_t* src_ar64, - uint8_t* dst_argb, - int width) { - asm volatile( - "ld1 {v4.16b}, %3 \n" // shuffler - "1: \n" - "ldp q0, q1, [%0], #32 \n" // load 4 pixels - "ldp q2, q3, [%0], #32 \n" // load 4 pixels - "tbl v0.16b, {v0.16b, v1.16b}, v4.16b \n" - "prfm pldl1keep, [%0, 448] \n" - "tbl v2.16b, {v2.16b, v3.16b}, v4.16b \n" - "subs %w2, %w2, #8 \n" // 8 processed per loop. - "stp q0, q2, [%1], #32 \n" // store 8 pixels - "b.gt 1b \n" - : "+r"(src_ar64), // %0 - "+r"(dst_argb), // %1 - "+r"(width) // %2 - : "m"(kShuffleAR64ToARGB) // %3 - : "cc", "memory", "v0", "v1", "v2", "v3", "v4"); -} - -static const uvec8 kShuffleAB64ToARGB = {5, 3, 1, 7, 13, 11, 9, 15, - 21, 19, 17, 23, 29, 27, 25, 31}; - -void AB64ToARGBRow_NEON(const uint16_t* src_ab64, - uint8_t* dst_argb, - int width) { - asm volatile( - "ld1 {v4.16b}, %3 \n" // shuffler - "1: \n" - "ldp q0, q1, [%0], #32 \n" // load 4 pixels - "ldp q2, q3, [%0], #32 \n" // load 4 pixels - "tbl v0.16b, {v0.16b, v1.16b}, v4.16b \n" - "prfm pldl1keep, [%0, 448] \n" - "tbl v2.16b, {v2.16b, v3.16b}, v4.16b \n" - "subs %w2, %w2, #8 \n" // 8 processed per loop. - "stp q0, q2, [%1], #32 \n" // store 8 pixels - "b.gt 1b \n" - : "+r"(src_ab64), // %0 - "+r"(dst_argb), // %1 - "+r"(width) // %2 - : "m"(kShuffleAB64ToARGB) // %3 - : "cc", "memory", "v0", "v1", "v2", "v3", "v4"); -} - -void ARGBToYRow_NEON(const uint8_t* src_argb, uint8_t* dst_y, int width) { - asm volatile( - "movi v4.8b, #25 \n" // B * 0.1016 coefficient - "movi v5.8b, #129 \n" // G * 0.5078 coefficient - "movi v6.8b, #66 \n" // R * 0.2578 coefficient - "movi v7.8b, #16 \n" // Add 16 constant - "1: \n" - "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB - "subs %w2, %w2, #8 \n" // 8 processed per loop. - "umull v3.8h, v0.8b, v4.8b \n" // B - "prfm pldl1keep, [%0, 448] \n" - "umlal v3.8h, v1.8b, v5.8b \n" // G - "umlal v3.8h, v2.8b, v6.8b \n" // R - "uqrshrn v0.8b, v3.8h, #8 \n" // 16 bit to 8 bit Y - "uqadd v0.8b, v0.8b, v7.8b \n" - "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. - "b.gt 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_y), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"); -} - -void ARGBExtractAlphaRow_NEON(const uint8_t* src_argb, - uint8_t* dst_a, - int width) { - asm volatile( - "1: \n" - "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 - "prfm pldl1keep, [%0, 448] \n" - "subs %w2, %w2, #16 \n" // 16 processed per loop - "st1 {v3.16b}, [%1], #16 \n" // store 16 A's. - "b.gt 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_a), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List - ); -} - -void ARGBToYJRow_NEON(const uint8_t* src_argb, uint8_t* dst_y, int width) { - asm volatile( - "movi v4.8b, #29 \n" // B * 0.1140 coefficient - "movi v5.8b, #150 \n" // G * 0.5870 coefficient - "movi v6.8b, #77 \n" // R * 0.2990 coefficient - "1: \n" - "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB - "subs %w2, %w2, #8 \n" // 8 processed per loop. 
- "umull v3.8h, v0.8b, v4.8b \n" // B - "prfm pldl1keep, [%0, 448] \n" - "umlal v3.8h, v1.8b, v5.8b \n" // G - "umlal v3.8h, v2.8b, v6.8b \n" // R - "uqrshrn v0.8b, v3.8h, #8 \n" // 16 bit to 8 bit Y - "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. - "b.gt 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_y), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6"); -} - -void RGBAToYJRow_NEON(const uint8_t* src_rgba, uint8_t* dst_y, int width) { - asm volatile( - "movi v4.8b, #29 \n" // B * 0.1140 coefficient - "movi v5.8b, #150 \n" // G * 0.5870 coefficient - "movi v6.8b, #77 \n" // R * 0.2990 coefficient - "1: \n" - "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 RGBA - "subs %w2, %w2, #8 \n" // 8 processed per loop. - "umull v0.8h, v1.8b, v4.8b \n" // B - "prfm pldl1keep, [%0, 448] \n" - "umlal v0.8h, v2.8b, v5.8b \n" // G - "umlal v0.8h, v3.8b, v6.8b \n" // R - "uqrshrn v3.8b, v0.8h, #8 \n" // 16 bit to 8 bit Y - "st1 {v3.8b}, [%1], #8 \n" // store 8 pixels Y. - "b.gt 1b \n" - : "+r"(src_rgba), // %0 - "+r"(dst_y), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6"); -} - -// 8x1 pixels. -void ARGBToUV444Row_NEON(const uint8_t* src_argb, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - asm volatile( - "movi v24.8b, #112 \n" // UB / VR 0.875 - // coefficient - "movi v25.8b, #74 \n" // UG -0.5781 coefficient - "movi v26.8b, #38 \n" // UR -0.2969 coefficient - "movi v27.8b, #18 \n" // VB -0.1406 coefficient - "movi v28.8b, #94 \n" // VG -0.7344 coefficient - "movi v29.16b,#0x80 \n" // 128.5 - "1: \n" - "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB - "subs %w3, %w3, #8 \n" // 8 processed per loop. - "umull v4.8h, v0.8b, v24.8b \n" // B - "prfm pldl1keep, [%0, 448] \n" - "umlsl v4.8h, v1.8b, v25.8b \n" // G - "umlsl v4.8h, v2.8b, v26.8b \n" // R - "add v4.8h, v4.8h, v29.8h \n" // +128 -> unsigned - - "umull v3.8h, v2.8b, v24.8b \n" // R - "umlsl v3.8h, v1.8b, v28.8b \n" // G - "umlsl v3.8h, v0.8b, v27.8b \n" // B - "add v3.8h, v3.8h, v29.8h \n" // +128 -> unsigned - - "uqshrn v0.8b, v4.8h, #8 \n" // 16 bit to 8 bit U - "uqshrn v1.8b, v3.8h, #8 \n" // 16 bit to 8 bit V - - "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels U. - "st1 {v1.8b}, [%2], #8 \n" // store 8 pixels V. - "b.gt 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_u), // %1 - "+r"(dst_v), // %2 - "+r"(width) // %3 - : - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v24", "v25", "v26", - "v27", "v28", "v29"); -} - -#define RGBTOUV_SETUP_REG \ - "movi v20.8h, #56, lsl #0 \n" /* UB/VR coefficient (0.875) / 2 */ \ - "movi v21.8h, #37, lsl #0 \n" /* UG coefficient (-0.5781) / 2 */ \ - "movi v22.8h, #19, lsl #0 \n" /* UR coefficient (-0.2969) / 2 */ \ - "movi v23.8h, #9, lsl #0 \n" /* VB coefficient (-0.1406) / 2 */ \ - "movi v24.8h, #47, lsl #0 \n" /* VG coefficient (-0.7344) / 2 */ \ - "movi v25.16b, #0x80 \n" /* 128.5 (0x8080 in 16-bit) */ - -// 16x2 pixels -> 8x1. width is number of argb pixels. e.g. 16. 
-// clang-format off -#define RGBTOUV(QB, QG, QR) \ - "mul v3.8h, " #QB ",v20.8h \n" /* B */ \ - "mul v4.8h, " #QR ",v20.8h \n" /* R */ \ - "mls v3.8h, " #QG ",v21.8h \n" /* G */ \ - "mls v4.8h, " #QG ",v24.8h \n" /* G */ \ - "mls v3.8h, " #QR ",v22.8h \n" /* R */ \ - "mls v4.8h, " #QB ",v23.8h \n" /* B */ \ - "add v3.8h, v3.8h, v25.8h \n" /* +128 -> unsigned */ \ - "add v4.8h, v4.8h, v25.8h \n" /* +128 -> unsigned */ \ - "uqshrn v0.8b, v3.8h, #8 \n" /* 16 bit to 8 bit U */ \ - "uqshrn v1.8b, v4.8h, #8 \n" /* 16 bit to 8 bit V */ -// clang-format on - -// TODO(fbarchard): Consider vhadd vertical, then vpaddl horizontal, avoid shr. -// TODO(fbarchard): consider ptrdiff_t for all strides. - -void ARGBToUVRow_NEON(const uint8_t* src_argb, - int src_stride_argb, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - const uint8_t* src_argb_1 = src_argb + src_stride_argb; - asm volatile ( - RGBTOUV_SETUP_REG - "1: \n" - "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels. - "uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts. - "prfm pldl1keep, [%0, 448] \n" - "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts. - "uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts. - - "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load next 16 - "uadalp v0.8h, v4.16b \n" // B 16 bytes -> 8 shorts. - "prfm pldl1keep, [%1, 448] \n" - "uadalp v1.8h, v5.16b \n" // G 16 bytes -> 8 shorts. - "uadalp v2.8h, v6.16b \n" // R 16 bytes -> 8 shorts. - - "urshr v0.8h, v0.8h, #1 \n" // 2x average - "urshr v1.8h, v1.8h, #1 \n" - "urshr v2.8h, v2.8h, #1 \n" - - "subs %w4, %w4, #16 \n" // 16 processed per loop. - RGBTOUV(v0.8h, v1.8h, v2.8h) - "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. - "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V. - "b.gt 1b \n" - : "+r"(src_argb), // %0 - "+r"(src_argb_1), // %1 - "+r"(dst_u), // %2 - "+r"(dst_v), // %3 - "+r"(width) // %4 - : - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", - "v20", "v21", "v22", "v23", "v24", "v25" - ); -} - -void ARGBToUVJRow_NEON(const uint8_t* src_argb, - int src_stride_argb, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - const uint8_t* src_argb_1 = src_argb + src_stride_argb; - asm volatile ( - "movi v20.8h, #63, lsl #0 \n" // UB/VR coeff (0.500) / 2 - "movi v21.8h, #42, lsl #0 \n" // UG coeff (-0.33126) / 2 - "movi v22.8h, #21, lsl #0 \n" // UR coeff (-0.16874) / 2 - "movi v23.8h, #10, lsl #0 \n" // VB coeff (-0.08131) / 2 - "movi v24.8h, #53, lsl #0 \n" // VG coeff (-0.41869) / 2 - "movi v25.16b, #0x80 \n" // 128.5 (0x8080 in 16-bit) - "1: \n" - "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels. - "uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts. - "prfm pldl1keep, [%0, 448] \n" - "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts. - "uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts. - "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load next 16 - "uadalp v0.8h, v4.16b \n" // B 16 bytes -> 8 shorts. - "prfm pldl1keep, [%1, 448] \n" - "uadalp v1.8h, v5.16b \n" // G 16 bytes -> 8 shorts. - "uadalp v2.8h, v6.16b \n" // R 16 bytes -> 8 shorts. - - "urshr v0.8h, v0.8h, #1 \n" // 2x average - "urshr v1.8h, v1.8h, #1 \n" - "urshr v2.8h, v2.8h, #1 \n" - - "subs %w4, %w4, #16 \n" // 32 processed per loop. - RGBTOUV(v0.8h, v1.8h, v2.8h) - "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. - "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V. 
- "b.gt 1b \n" - : "+r"(src_argb), // %0 - "+r"(src_argb_1), // %1 - "+r"(dst_u), // %2 - "+r"(dst_v), // %3 - "+r"(width) // %4 - : - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", - "v20", "v21", "v22", "v23", "v24", "v25" - ); -} - -void BGRAToUVRow_NEON(const uint8_t* src_bgra, - int src_stride_bgra, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - const uint8_t* src_bgra_1 = src_bgra + src_stride_bgra; - asm volatile ( - RGBTOUV_SETUP_REG - "1: \n" - "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels. - "uaddlp v0.8h, v3.16b \n" // B 16 bytes -> 8 shorts. - "prfm pldl1keep, [%0, 448] \n" - "uaddlp v3.8h, v2.16b \n" // G 16 bytes -> 8 shorts. - "uaddlp v2.8h, v1.16b \n" // R 16 bytes -> 8 shorts. - "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load 16 more - "uadalp v0.8h, v7.16b \n" // B 16 bytes -> 8 shorts. - "prfm pldl1keep, [%1, 448] \n" - "uadalp v3.8h, v6.16b \n" // G 16 bytes -> 8 shorts. - "uadalp v2.8h, v5.16b \n" // R 16 bytes -> 8 shorts. - - "urshr v0.8h, v0.8h, #1 \n" // 2x average - "urshr v1.8h, v3.8h, #1 \n" - "urshr v2.8h, v2.8h, #1 \n" - - "subs %w4, %w4, #16 \n" // 32 processed per loop. - RGBTOUV(v0.8h, v1.8h, v2.8h) - "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. - "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V. - "b.gt 1b \n" - : "+r"(src_bgra), // %0 - "+r"(src_bgra_1), // %1 - "+r"(dst_u), // %2 - "+r"(dst_v), // %3 - "+r"(width) // %4 - : - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", - "v20", "v21", "v22", "v23", "v24", "v25" - ); -} - -void ABGRToUVRow_NEON(const uint8_t* src_abgr, - int src_stride_abgr, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - const uint8_t* src_abgr_1 = src_abgr + src_stride_abgr; - asm volatile ( - RGBTOUV_SETUP_REG - "1: \n" - "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels. - "uaddlp v3.8h, v2.16b \n" // B 16 bytes -> 8 shorts. - "prfm pldl1keep, [%0, 448] \n" - "uaddlp v2.8h, v1.16b \n" // G 16 bytes -> 8 shorts. - "uaddlp v1.8h, v0.16b \n" // R 16 bytes -> 8 shorts. - "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load 16 more. - "uadalp v3.8h, v6.16b \n" // B 16 bytes -> 8 shorts. - "prfm pldl1keep, [%1, 448] \n" - "uadalp v2.8h, v5.16b \n" // G 16 bytes -> 8 shorts. - "uadalp v1.8h, v4.16b \n" // R 16 bytes -> 8 shorts. - - "urshr v0.8h, v3.8h, #1 \n" // 2x average - "urshr v2.8h, v2.8h, #1 \n" - "urshr v1.8h, v1.8h, #1 \n" - - "subs %w4, %w4, #16 \n" // 32 processed per loop. - RGBTOUV(v0.8h, v2.8h, v1.8h) - "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. - "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V. - "b.gt 1b \n" - : "+r"(src_abgr), // %0 - "+r"(src_abgr_1), // %1 - "+r"(dst_u), // %2 - "+r"(dst_v), // %3 - "+r"(width) // %4 - : - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", - "v20", "v21", "v22", "v23", "v24", "v25" - ); -} - -void RGBAToUVRow_NEON(const uint8_t* src_rgba, - int src_stride_rgba, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - const uint8_t* src_rgba_1 = src_rgba + src_stride_rgba; - asm volatile ( - RGBTOUV_SETUP_REG - "1: \n" - "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels. - "uaddlp v0.8h, v1.16b \n" // B 16 bytes -> 8 shorts. - "prfm pldl1keep, [%0, 448] \n" - "uaddlp v1.8h, v2.16b \n" // G 16 bytes -> 8 shorts. - "uaddlp v2.8h, v3.16b \n" // R 16 bytes -> 8 shorts. - "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load 16 more. - "uadalp v0.8h, v5.16b \n" // B 16 bytes -> 8 shorts. 
- "prfm pldl1keep, [%1, 448] \n" - "uadalp v1.8h, v6.16b \n" // G 16 bytes -> 8 shorts. - "uadalp v2.8h, v7.16b \n" // R 16 bytes -> 8 shorts. - - "urshr v0.8h, v0.8h, #1 \n" // 2x average - "urshr v1.8h, v1.8h, #1 \n" - "urshr v2.8h, v2.8h, #1 \n" - - "subs %w4, %w4, #16 \n" // 32 processed per loop. - RGBTOUV(v0.8h, v1.8h, v2.8h) - "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. - "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V. - "b.gt 1b \n" - : "+r"(src_rgba), // %0 - "+r"(src_rgba_1), // %1 - "+r"(dst_u), // %2 - "+r"(dst_v), // %3 - "+r"(width) // %4 - : - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", - "v20", "v21", "v22", "v23", "v24", "v25" - ); -} - -void RGB24ToUVRow_NEON(const uint8_t* src_rgb24, - int src_stride_rgb24, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - const uint8_t* src_rgb24_1 = src_rgb24 + src_stride_rgb24; - asm volatile ( - RGBTOUV_SETUP_REG - "1: \n" - "ld3 {v0.16b,v1.16b,v2.16b}, [%0], #48 \n" // load 16 pixels. - "uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts. - "prfm pldl1keep, [%0, 448] \n" - "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts. - "uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts. - "ld3 {v4.16b,v5.16b,v6.16b}, [%1], #48 \n" // load 16 more. - "uadalp v0.8h, v4.16b \n" // B 16 bytes -> 8 shorts. - "prfm pldl1keep, [%1, 448] \n" - "uadalp v1.8h, v5.16b \n" // G 16 bytes -> 8 shorts. - "uadalp v2.8h, v6.16b \n" // R 16 bytes -> 8 shorts. - - "urshr v0.8h, v0.8h, #1 \n" // 2x average - "urshr v1.8h, v1.8h, #1 \n" - "urshr v2.8h, v2.8h, #1 \n" - - "subs %w4, %w4, #16 \n" // 32 processed per loop. - RGBTOUV(v0.8h, v1.8h, v2.8h) - "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. - "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V. - "b.gt 1b \n" - : "+r"(src_rgb24), // %0 - "+r"(src_rgb24_1), // %1 - "+r"(dst_u), // %2 - "+r"(dst_v), // %3 - "+r"(width) // %4 - : - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", - "v20", "v21", "v22", "v23", "v24", "v25" - ); -} - -void RAWToUVRow_NEON(const uint8_t* src_raw, - int src_stride_raw, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - const uint8_t* src_raw_1 = src_raw + src_stride_raw; - asm volatile ( - RGBTOUV_SETUP_REG - "1: \n" - "ld3 {v0.16b,v1.16b,v2.16b}, [%0], #48 \n" // load 8 RAW pixels. - "uaddlp v2.8h, v2.16b \n" // B 16 bytes -> 8 shorts. - "prfm pldl1keep, [%0, 448] \n" - "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts. - "uaddlp v0.8h, v0.16b \n" // R 16 bytes -> 8 shorts. - "ld3 {v4.16b,v5.16b,v6.16b}, [%1], #48 \n" // load 8 more RAW pixels - "uadalp v2.8h, v6.16b \n" // B 16 bytes -> 8 shorts. - "prfm pldl1keep, [%1, 448] \n" - "uadalp v1.8h, v5.16b \n" // G 16 bytes -> 8 shorts. - "uadalp v0.8h, v4.16b \n" // R 16 bytes -> 8 shorts. - - "urshr v2.8h, v2.8h, #1 \n" // 2x average - "urshr v1.8h, v1.8h, #1 \n" - "urshr v0.8h, v0.8h, #1 \n" - - "subs %w4, %w4, #16 \n" // 32 processed per loop. - RGBTOUV(v2.8h, v1.8h, v0.8h) - "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. - "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V. - "b.gt 1b \n" - : "+r"(src_raw), // %0 - "+r"(src_raw_1), // %1 - "+r"(dst_u), // %2 - "+r"(dst_v), // %3 - "+r"(width) // %4 - : - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", - "v20", "v21", "v22", "v23", "v24", "v25" - ); -} - -// 16x2 pixels -> 8x1. width is number of rgb pixels. e.g. 16. 
-void RGB565ToUVRow_NEON(const uint8_t* src_rgb565, - int src_stride_rgb565, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - const uint8_t* src_rgb565_1 = src_rgb565 + src_stride_rgb565; - asm volatile( - RGBTOUV_SETUP_REG - "1: \n" - "ld1 {v0.16b}, [%0], #16 \n" // load 8 RGB565 pixels. - RGB565TOARGB - "uaddlp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts. - "prfm pldl1keep, [%0, 448] \n" - "uaddlp v17.4h, v1.8b \n" // G 8 bytes -> 4 shorts. - "uaddlp v18.4h, v2.8b \n" // R 8 bytes -> 4 shorts. - "ld1 {v0.16b}, [%0], #16 \n" // next 8 RGB565 pixels. - RGB565TOARGB - "uaddlp v26.4h, v0.8b \n" // B 8 bytes -> 4 shorts. - "uaddlp v27.4h, v1.8b \n" // G 8 bytes -> 4 shorts. - "uaddlp v28.4h, v2.8b \n" // R 8 bytes -> 4 shorts. - - "ld1 {v0.16b}, [%1], #16 \n" // load 8 RGB565 pixels. - RGB565TOARGB - "uadalp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts. - "prfm pldl1keep, [%1, 448] \n" - "uadalp v17.4h, v1.8b \n" // G 8 bytes -> 4 shorts. - "uadalp v18.4h, v2.8b \n" // R 8 bytes -> 4 shorts. - "ld1 {v0.16b}, [%1], #16 \n" // next 8 RGB565 pixels. - RGB565TOARGB - "uadalp v26.4h, v0.8b \n" // B 8 bytes -> 4 shorts. - "uadalp v27.4h, v1.8b \n" // G 8 bytes -> 4 shorts. - "uadalp v28.4h, v2.8b \n" // R 8 bytes -> 4 shorts. - - "ins v16.D[1], v26.D[0] \n" - "ins v17.D[1], v27.D[0] \n" - "ins v18.D[1], v28.D[0] \n" - - "urshr v0.8h, v16.8h, #1 \n" // 2x average - "urshr v1.8h, v17.8h, #1 \n" - "urshr v2.8h, v18.8h, #1 \n" - - "subs %w4, %w4, #16 \n" // 16 processed per loop. - RGBTOUV(v0.8h, v1.8h, v2.8h) - "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. - "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V. - "b.gt 1b \n" - : "+r"(src_rgb565), // %0 - "+r"(src_rgb565_1), // %1 - "+r"(dst_u), // %2 - "+r"(dst_v), // %3 - "+r"(width) // %4 - : - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v16", "v17", - "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", - "v28"); -} - -// 16x2 pixels -> 8x1. width is number of argb pixels. e.g. 16. -void ARGB1555ToUVRow_NEON(const uint8_t* src_argb1555, - int src_stride_argb1555, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - const uint8_t* src_argb1555_1 = src_argb1555 + src_stride_argb1555; - asm volatile( - RGBTOUV_SETUP_REG - "1: \n" - "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB1555 pixels. - RGB555TOARGB - "uaddlp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts. - "prfm pldl1keep, [%0, 448] \n" - "uaddlp v17.4h, v1.8b \n" // G 8 bytes -> 4 shorts. - "uaddlp v18.4h, v2.8b \n" // R 8 bytes -> 4 shorts. - "ld1 {v0.16b}, [%0], #16 \n" // next 8 ARGB1555 pixels. - RGB555TOARGB - "uaddlp v26.4h, v0.8b \n" // B 8 bytes -> 4 shorts. - "uaddlp v27.4h, v1.8b \n" // G 8 bytes -> 4 shorts. - "uaddlp v28.4h, v2.8b \n" // R 8 bytes -> 4 shorts. - - "ld1 {v0.16b}, [%1], #16 \n" // load 8 ARGB1555 pixels. - RGB555TOARGB - "uadalp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts. - "prfm pldl1keep, [%1, 448] \n" - "uadalp v17.4h, v1.8b \n" // G 8 bytes -> 4 shorts. - "uadalp v18.4h, v2.8b \n" // R 8 bytes -> 4 shorts. - "ld1 {v0.16b}, [%1], #16 \n" // next 8 ARGB1555 pixels. - RGB555TOARGB - "uadalp v26.4h, v0.8b \n" // B 8 bytes -> 4 shorts. - "uadalp v27.4h, v1.8b \n" // G 8 bytes -> 4 shorts. - "uadalp v28.4h, v2.8b \n" // R 8 bytes -> 4 shorts. - - "ins v16.D[1], v26.D[0] \n" - "ins v17.D[1], v27.D[0] \n" - "ins v18.D[1], v28.D[0] \n" - - "urshr v0.8h, v16.8h, #1 \n" // 2x average - "urshr v1.8h, v17.8h, #1 \n" - "urshr v2.8h, v18.8h, #1 \n" - - "subs %w4, %w4, #16 \n" // 16 processed per loop. 
- RGBTOUV(v0.8h, v1.8h, v2.8h) - "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. - "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V. - "b.gt 1b \n" - : "+r"(src_argb1555), // %0 - "+r"(src_argb1555_1), // %1 - "+r"(dst_u), // %2 - "+r"(dst_v), // %3 - "+r"(width) // %4 - : - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v16", "v17", - "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", - "v28"); -} - -// 16x2 pixels -> 8x1. width is number of argb pixels. e.g. 16. -void ARGB4444ToUVRow_NEON(const uint8_t* src_argb4444, - int src_stride_argb4444, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - const uint8_t* src_argb4444_1 = src_argb4444 + src_stride_argb4444; - asm volatile( - RGBTOUV_SETUP_REG // sets v20-v25 - "1: \n" - "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB4444 pixels. - ARGB4444TOARGB - "uaddlp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts. - "prfm pldl1keep, [%0, 448] \n" - "uaddlp v17.4h, v1.8b \n" // G 8 bytes -> 4 shorts. - "uaddlp v18.4h, v2.8b \n" // R 8 bytes -> 4 shorts. - "ld1 {v0.16b}, [%0], #16 \n" // next 8 ARGB4444 pixels. - ARGB4444TOARGB - "uaddlp v26.4h, v0.8b \n" // B 8 bytes -> 4 shorts. - "uaddlp v27.4h, v1.8b \n" // G 8 bytes -> 4 shorts. - "uaddlp v28.4h, v2.8b \n" // R 8 bytes -> 4 shorts. - - "ld1 {v0.16b}, [%1], #16 \n" // load 8 ARGB4444 pixels. - ARGB4444TOARGB - "uadalp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts. - "prfm pldl1keep, [%1, 448] \n" - "uadalp v17.4h, v1.8b \n" // G 8 bytes -> 4 shorts. - "uadalp v18.4h, v2.8b \n" // R 8 bytes -> 4 shorts. - "ld1 {v0.16b}, [%1], #16 \n" // next 8 ARGB4444 pixels. - ARGB4444TOARGB - "uadalp v26.4h, v0.8b \n" // B 8 bytes -> 4 shorts. - "uadalp v27.4h, v1.8b \n" // G 8 bytes -> 4 shorts. - "uadalp v28.4h, v2.8b \n" // R 8 bytes -> 4 shorts. - - "ins v16.D[1], v26.D[0] \n" - "ins v17.D[1], v27.D[0] \n" - "ins v18.D[1], v28.D[0] \n" - - "urshr v0.8h, v16.8h, #1 \n" // 2x average - "urshr v1.8h, v17.8h, #1 \n" - "urshr v2.8h, v18.8h, #1 \n" - - "subs %w4, %w4, #16 \n" // 16 processed per loop. - RGBTOUV(v0.8h, v1.8h, v2.8h) - "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. - "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V. - "b.gt 1b \n" - : "+r"(src_argb4444), // %0 - "+r"(src_argb4444_1), // %1 - "+r"(dst_u), // %2 - "+r"(dst_v), // %3 - "+r"(width) // %4 - : - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v16", "v17", - "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", - "v28" - - ); -} - -void RGB565ToYRow_NEON(const uint8_t* src_rgb565, uint8_t* dst_y, int width) { - asm volatile( - "movi v24.8b, #25 \n" // B * 0.1016 coefficient - "movi v25.8b, #129 \n" // G * 0.5078 coefficient - "movi v26.8b, #66 \n" // R * 0.2578 coefficient - "movi v27.8b, #16 \n" // Add 16 constant - "1: \n" - "ld1 {v0.16b}, [%0], #16 \n" // load 8 RGB565 pixels. - "subs %w2, %w2, #8 \n" // 8 processed per loop. - RGB565TOARGB - "umull v3.8h, v0.8b, v24.8b \n" // B - "prfm pldl1keep, [%0, 448] \n" - "umlal v3.8h, v1.8b, v25.8b \n" // G - "umlal v3.8h, v2.8b, v26.8b \n" // R - "uqrshrn v0.8b, v3.8h, #8 \n" // 16 bit to 8 bit Y - "uqadd v0.8b, v0.8b, v27.8b \n" - "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. 
- "b.gt 1b \n" - : "+r"(src_rgb565), // %0 - "+r"(dst_y), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v6", "v24", "v25", "v26", - "v27"); -} - -void ARGB1555ToYRow_NEON(const uint8_t* src_argb1555, - uint8_t* dst_y, - int width) { - asm volatile( - "movi v4.8b, #25 \n" // B * 0.1016 coefficient - "movi v5.8b, #129 \n" // G * 0.5078 coefficient - "movi v6.8b, #66 \n" // R * 0.2578 coefficient - "movi v7.8b, #16 \n" // Add 16 constant - "1: \n" - "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB1555 pixels. - "subs %w2, %w2, #8 \n" // 8 processed per loop. - ARGB1555TOARGB - "umull v3.8h, v0.8b, v4.8b \n" // B - "prfm pldl1keep, [%0, 448] \n" - "umlal v3.8h, v1.8b, v5.8b \n" // G - "umlal v3.8h, v2.8b, v6.8b \n" // R - "uqrshrn v0.8b, v3.8h, #8 \n" // 16 bit to 8 bit Y - "uqadd v0.8b, v0.8b, v7.8b \n" - "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. - "b.gt 1b \n" - : "+r"(src_argb1555), // %0 - "+r"(dst_y), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"); -} - -void ARGB4444ToYRow_NEON(const uint8_t* src_argb4444, - uint8_t* dst_y, - int width) { - asm volatile( - "movi v24.8b, #25 \n" // B * 0.1016 coefficient - "movi v25.8b, #129 \n" // G * 0.5078 coefficient - "movi v26.8b, #66 \n" // R * 0.2578 coefficient - "movi v27.8b, #16 \n" // Add 16 constant - "1: \n" - "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB4444 pixels. - "subs %w2, %w2, #8 \n" // 8 processed per loop. - ARGB4444TOARGB - "umull v3.8h, v0.8b, v24.8b \n" // B - "prfm pldl1keep, [%0, 448] \n" - "umlal v3.8h, v1.8b, v25.8b \n" // G - "umlal v3.8h, v2.8b, v26.8b \n" // R - "uqrshrn v0.8b, v3.8h, #8 \n" // 16 bit to 8 bit Y - "uqadd v0.8b, v0.8b, v27.8b \n" - "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. - "b.gt 1b \n" - : "+r"(src_argb4444), // %0 - "+r"(dst_y), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "v0", "v1", "v2", "v3", "v24", "v25", "v26", "v27"); -} - -void BGRAToYRow_NEON(const uint8_t* src_bgra, uint8_t* dst_y, int width) { - asm volatile( - "movi v4.8b, #66 \n" // R * 0.2578 coefficient - "movi v5.8b, #129 \n" // G * 0.5078 coefficient - "movi v6.8b, #25 \n" // B * 0.1016 coefficient - "movi v7.8b, #16 \n" // Add 16 constant - "1: \n" - "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 pixels. - "subs %w2, %w2, #8 \n" // 8 processed per loop. - "umull v16.8h, v1.8b, v4.8b \n" // R - "prfm pldl1keep, [%0, 448] \n" - "umlal v16.8h, v2.8b, v5.8b \n" // G - "umlal v16.8h, v3.8b, v6.8b \n" // B - "uqrshrn v0.8b, v16.8h, #8 \n" // 16 bit to 8 bit Y - "uqadd v0.8b, v0.8b, v7.8b \n" - "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. - "b.gt 1b \n" - : "+r"(src_bgra), // %0 - "+r"(dst_y), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16"); -} - -void ABGRToYRow_NEON(const uint8_t* src_abgr, uint8_t* dst_y, int width) { - asm volatile( - "movi v6.8b, #25 \n" // B * 0.1016 coefficient - "movi v5.8b, #129 \n" // G * 0.5078 coefficient - "movi v4.8b, #66 \n" // R * 0.2578 coefficient - "movi v7.8b, #16 \n" // Add 16 constant - "1: \n" - "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 pixels. - "subs %w2, %w2, #8 \n" // 8 processed per loop. - "umull v16.8h, v0.8b, v4.8b \n" // R - "prfm pldl1keep, [%0, 448] \n" - "umlal v16.8h, v1.8b, v5.8b \n" // G - "umlal v16.8h, v2.8b, v6.8b \n" // B - "uqrshrn v0.8b, v16.8h, #8 \n" // 16 bit to 8 bit Y - "uqadd v0.8b, v0.8b, v7.8b \n" - "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. 
- "b.gt 1b \n" - : "+r"(src_abgr), // %0 - "+r"(dst_y), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16"); -} - -void RGBAToYRow_NEON(const uint8_t* src_rgba, uint8_t* dst_y, int width) { - asm volatile( - "movi v4.8b, #25 \n" // B * 0.1016 coefficient - "movi v5.8b, #129 \n" // G * 0.5078 coefficient - "movi v6.8b, #66 \n" // R * 0.2578 coefficient - "movi v7.8b, #16 \n" // Add 16 constant - "1: \n" - "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 pixels. - "subs %w2, %w2, #8 \n" // 8 processed per loop. - "umull v16.8h, v1.8b, v4.8b \n" // B - "prfm pldl1keep, [%0, 448] \n" - "umlal v16.8h, v2.8b, v5.8b \n" // G - "umlal v16.8h, v3.8b, v6.8b \n" // R - "uqrshrn v0.8b, v16.8h, #8 \n" // 16 bit to 8 bit Y - "uqadd v0.8b, v0.8b, v7.8b \n" - "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. - "b.gt 1b \n" - : "+r"(src_rgba), // %0 - "+r"(dst_y), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16"); -} - -void RGB24ToYRow_NEON(const uint8_t* src_rgb24, uint8_t* dst_y, int width) { - asm volatile( - "movi v4.8b, #25 \n" // B * 0.1016 coefficient - "movi v5.8b, #129 \n" // G * 0.5078 coefficient - "movi v6.8b, #66 \n" // R * 0.2578 coefficient - "movi v7.8b, #16 \n" // Add 16 constant - "1: \n" - "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // load 8 pixels. - "subs %w2, %w2, #8 \n" // 8 processed per loop. - "umull v16.8h, v0.8b, v4.8b \n" // B - "prfm pldl1keep, [%0, 448] \n" - "umlal v16.8h, v1.8b, v5.8b \n" // G - "umlal v16.8h, v2.8b, v6.8b \n" // R - "uqrshrn v0.8b, v16.8h, #8 \n" // 16 bit to 8 bit Y - "uqadd v0.8b, v0.8b, v7.8b \n" - "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. - "b.gt 1b \n" - : "+r"(src_rgb24), // %0 - "+r"(dst_y), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16"); -} - -void RAWToYRow_NEON(const uint8_t* src_raw, uint8_t* dst_y, int width) { - asm volatile( - "movi v6.8b, #25 \n" // B * 0.1016 coefficient - "movi v5.8b, #129 \n" // G * 0.5078 coefficient - "movi v4.8b, #66 \n" // R * 0.2578 coefficient - "movi v7.8b, #16 \n" // Add 16 constant - "1: \n" - "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // load 8 pixels. - "subs %w2, %w2, #8 \n" // 8 processed per loop. - "umull v16.8h, v0.8b, v4.8b \n" // B - "prfm pldl1keep, [%0, 448] \n" - "umlal v16.8h, v1.8b, v5.8b \n" // G - "umlal v16.8h, v2.8b, v6.8b \n" // R - "uqrshrn v0.8b, v16.8h, #8 \n" // 16 bit to 8 bit Y - "uqadd v0.8b, v0.8b, v7.8b \n" - "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. - "b.gt 1b \n" - : "+r"(src_raw), // %0 - "+r"(dst_y), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16"); -} - -void RGB24ToYJRow_NEON(const uint8_t* src_rgb24, uint8_t* dst_yj, int width) { - asm volatile( - "movi v4.8b, #29 \n" // B * 0.1140 coefficient - "movi v5.8b, #150 \n" // G * 0.5870 coefficient - "movi v6.8b, #77 \n" // R * 0.2990 coefficient - "1: \n" - "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // load 8 pixels. - "subs %w2, %w2, #8 \n" // 8 processed per loop. - "umull v0.8h, v0.8b, v4.8b \n" // B - "prfm pldl1keep, [%0, 448] \n" - "umlal v0.8h, v1.8b, v5.8b \n" // G - "umlal v0.8h, v2.8b, v6.8b \n" // R - "uqrshrn v0.8b, v0.8h, #8 \n" // 16 bit to 8 bit Y - "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. 
- "b.gt 1b \n" - : "+r"(src_rgb24), // %0 - "+r"(dst_yj), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6"); -} - -void RAWToYJRow_NEON(const uint8_t* src_raw, uint8_t* dst_yj, int width) { - asm volatile( - "movi v6.8b, #29 \n" // B * 0.1140 coefficient - "movi v5.8b, #150 \n" // G * 0.5870 coefficient - "movi v4.8b, #77 \n" // R * 0.2990 coefficient - "1: \n" - "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // load 8 pixels. - "subs %w2, %w2, #8 \n" // 8 processed per loop. - "umull v0.8h, v0.8b, v4.8b \n" // B - "prfm pldl1keep, [%0, 448] \n" - "umlal v0.8h, v1.8b, v5.8b \n" // G - "umlal v0.8h, v2.8b, v6.8b \n" // R - "uqrshrn v0.8b, v0.8h, #8 \n" // 16 bit to 8 bit Y - "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. - "b.gt 1b \n" - : "+r"(src_raw), // %0 - "+r"(dst_yj), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6"); -} - -// Bilinear filter 16x2 -> 16x1 -void InterpolateRow_NEON(uint8_t* dst_ptr, - const uint8_t* src_ptr, - ptrdiff_t src_stride, - int dst_width, - int source_y_fraction) { - int y1_fraction = source_y_fraction; - int y0_fraction = 256 - y1_fraction; - const uint8_t* src_ptr1 = src_ptr + src_stride; - asm volatile( - "cmp %w4, #0 \n" - "b.eq 100f \n" - "cmp %w4, #128 \n" - "b.eq 50f \n" - - "dup v5.16b, %w4 \n" - "dup v4.16b, %w5 \n" - // General purpose row blend. - "1: \n" - "ld1 {v0.16b}, [%1], #16 \n" - "ld1 {v1.16b}, [%2], #16 \n" - "subs %w3, %w3, #16 \n" - "umull v2.8h, v0.8b, v4.8b \n" - "prfm pldl1keep, [%1, 448] \n" - "umull2 v3.8h, v0.16b, v4.16b \n" - "prfm pldl1keep, [%2, 448] \n" - "umlal v2.8h, v1.8b, v5.8b \n" - "umlal2 v3.8h, v1.16b, v5.16b \n" - "rshrn v0.8b, v2.8h, #8 \n" - "rshrn2 v0.16b, v3.8h, #8 \n" - "st1 {v0.16b}, [%0], #16 \n" - "b.gt 1b \n" - "b 99f \n" - - // Blend 50 / 50. - "50: \n" - "ld1 {v0.16b}, [%1], #16 \n" - "ld1 {v1.16b}, [%2], #16 \n" - "subs %w3, %w3, #16 \n" - "prfm pldl1keep, [%1, 448] \n" - "urhadd v0.16b, v0.16b, v1.16b \n" - "prfm pldl1keep, [%2, 448] \n" - "st1 {v0.16b}, [%0], #16 \n" - "b.gt 50b \n" - "b 99f \n" - - // Blend 100 / 0 - Copy row unchanged. - "100: \n" - "ld1 {v0.16b}, [%1], #16 \n" - "subs %w3, %w3, #16 \n" - "prfm pldl1keep, [%1, 448] \n" - "st1 {v0.16b}, [%0], #16 \n" - "b.gt 100b \n" - - "99: \n" - : "+r"(dst_ptr), // %0 - "+r"(src_ptr), // %1 - "+r"(src_ptr1), // %2 - "+r"(dst_width), // %3 - "+r"(y1_fraction), // %4 - "+r"(y0_fraction) // %5 - : - : "cc", "memory", "v0", "v1", "v3", "v4", "v5"); -} - -// dr * (256 - sa) / 256 + sr = dr - dr * sa / 256 + sr -void ARGBBlendRow_NEON(const uint8_t* src_argb, - const uint8_t* src_argb1, - uint8_t* dst_argb, - int width) { - asm volatile( - "subs %w3, %w3, #8 \n" - "b.lt 89f \n" - // Blend 8 pixels. - "8: \n" - "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB0 - "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 ARGB1 - "subs %w3, %w3, #8 \n" // 8 processed per loop. 
- "umull v16.8h, v4.8b, v3.8b \n" // db * a - "prfm pldl1keep, [%0, 448] \n" - "umull v17.8h, v5.8b, v3.8b \n" // dg * a - "prfm pldl1keep, [%1, 448] \n" - "umull v18.8h, v6.8b, v3.8b \n" // dr * a - "uqrshrn v16.8b, v16.8h, #8 \n" // db >>= 8 - "uqrshrn v17.8b, v17.8h, #8 \n" // dg >>= 8 - "uqrshrn v18.8b, v18.8h, #8 \n" // dr >>= 8 - "uqsub v4.8b, v4.8b, v16.8b \n" // db - (db * a / 256) - "uqsub v5.8b, v5.8b, v17.8b \n" // dg - (dg * a / 256) - "uqsub v6.8b, v6.8b, v18.8b \n" // dr - (dr * a / 256) - "uqadd v0.8b, v0.8b, v4.8b \n" // + sb - "uqadd v1.8b, v1.8b, v5.8b \n" // + sg - "uqadd v2.8b, v2.8b, v6.8b \n" // + sr - "movi v3.8b, #255 \n" // a = 255 - "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB - // pixels - "b.ge 8b \n" - - "89: \n" - "adds %w3, %w3, #8-1 \n" - "b.lt 99f \n" - - // Blend 1 pixels. - "1: \n" - "ld4 {v0.b,v1.b,v2.b,v3.b}[0], [%0], #4 \n" // load 1 pixel - // ARGB0. - "ld4 {v4.b,v5.b,v6.b,v7.b}[0], [%1], #4 \n" // load 1 pixel - // ARGB1. - "subs %w3, %w3, #1 \n" // 1 processed per loop. - "umull v16.8h, v4.8b, v3.8b \n" // db * a - "prfm pldl1keep, [%0, 448] \n" - "umull v17.8h, v5.8b, v3.8b \n" // dg * a - "prfm pldl1keep, [%1, 448] \n" - "umull v18.8h, v6.8b, v3.8b \n" // dr * a - "uqrshrn v16.8b, v16.8h, #8 \n" // db >>= 8 - "uqrshrn v17.8b, v17.8h, #8 \n" // dg >>= 8 - "uqrshrn v18.8b, v18.8h, #8 \n" // dr >>= 8 - "uqsub v4.8b, v4.8b, v16.8b \n" // db - (db * a / 256) - "uqsub v5.8b, v5.8b, v17.8b \n" // dg - (dg * a / 256) - "uqsub v6.8b, v6.8b, v18.8b \n" // dr - (dr * a / 256) - "uqadd v0.8b, v0.8b, v4.8b \n" // + sb - "uqadd v1.8b, v1.8b, v5.8b \n" // + sg - "uqadd v2.8b, v2.8b, v6.8b \n" // + sr - "movi v3.8b, #255 \n" // a = 255 - "st4 {v0.b,v1.b,v2.b,v3.b}[0], [%2], #4 \n" // store 1 pixel. - "b.ge 1b \n" - - "99: \n" - - : "+r"(src_argb), // %0 - "+r"(src_argb1), // %1 - "+r"(dst_argb), // %2 - "+r"(width) // %3 - : - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", - "v17", "v18"); -} - -// Attenuate 8 pixels at a time. -void ARGBAttenuateRow_NEON(const uint8_t* src_argb, - uint8_t* dst_argb, - int width) { - asm volatile( - // Attenuate 8 pixels. - "1: \n" - "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB - "subs %w2, %w2, #8 \n" // 8 processed per loop. - "umull v4.8h, v0.8b, v3.8b \n" // b * a - "prfm pldl1keep, [%0, 448] \n" - "umull v5.8h, v1.8b, v3.8b \n" // g * a - "umull v6.8h, v2.8b, v3.8b \n" // r * a - "uqrshrn v0.8b, v4.8h, #8 \n" // b >>= 8 - "uqrshrn v1.8b, v5.8h, #8 \n" // g >>= 8 - "uqrshrn v2.8b, v6.8h, #8 \n" // r >>= 8 - "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB - "b.gt 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_argb), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6"); -} - -// Quantize 8 ARGB pixels (32 bytes). -// dst = (dst * scale >> 16) * interval_size + interval_offset; -void ARGBQuantizeRow_NEON(uint8_t* dst_argb, - int scale, - int interval_size, - int interval_offset, - int width) { - asm volatile( - "dup v4.8h, %w2 \n" - "ushr v4.8h, v4.8h, #1 \n" // scale >>= 1 - "dup v5.8h, %w3 \n" // interval multiply. - "dup v6.8h, %w4 \n" // interval add - - // 8 pixel loop. - "1: \n" - "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0] \n" // load 8 ARGB. - "subs %w1, %w1, #8 \n" // 8 processed per loop. - "uxtl v0.8h, v0.8b \n" // b (0 .. 
255) - "prfm pldl1keep, [%0, 448] \n" - "uxtl v1.8h, v1.8b \n" - "uxtl v2.8h, v2.8b \n" - "sqdmulh v0.8h, v0.8h, v4.8h \n" // b * scale - "sqdmulh v1.8h, v1.8h, v4.8h \n" // g - "sqdmulh v2.8h, v2.8h, v4.8h \n" // r - "mul v0.8h, v0.8h, v5.8h \n" // b * interval_size - "mul v1.8h, v1.8h, v5.8h \n" // g - "mul v2.8h, v2.8h, v5.8h \n" // r - "add v0.8h, v0.8h, v6.8h \n" // b + interval_offset - "add v1.8h, v1.8h, v6.8h \n" // g - "add v2.8h, v2.8h, v6.8h \n" // r - "uqxtn v0.8b, v0.8h \n" - "uqxtn v1.8b, v1.8h \n" - "uqxtn v2.8b, v2.8h \n" - "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // store 8 ARGB - "b.gt 1b \n" - : "+r"(dst_argb), // %0 - "+r"(width) // %1 - : "r"(scale), // %2 - "r"(interval_size), // %3 - "r"(interval_offset) // %4 - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6"); -} - -// Shade 8 pixels at a time by specified value. -// NOTE vqrdmulh.s16 q10, q10, d0[0] must use a scaler register from 0 to 8. -// Rounding in vqrdmulh does +1 to high if high bit of low s16 is set. -void ARGBShadeRow_NEON(const uint8_t* src_argb, - uint8_t* dst_argb, - int width, - uint32_t value) { - asm volatile( - "dup v0.4s, %w3 \n" // duplicate scale value. - "zip1 v0.8b, v0.8b, v0.8b \n" // v0.8b aarrggbb. - "ushr v0.8h, v0.8h, #1 \n" // scale / 2. - - // 8 pixel loop. - "1: \n" - "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%0], #32 \n" // load 8 ARGB - "subs %w2, %w2, #8 \n" // 8 processed per loop. - "uxtl v4.8h, v4.8b \n" // b (0 .. 255) - "prfm pldl1keep, [%0, 448] \n" - "uxtl v5.8h, v5.8b \n" - "uxtl v6.8h, v6.8b \n" - "uxtl v7.8h, v7.8b \n" - "sqrdmulh v4.8h, v4.8h, v0.h[0] \n" // b * scale * 2 - "sqrdmulh v5.8h, v5.8h, v0.h[1] \n" // g - "sqrdmulh v6.8h, v6.8h, v0.h[2] \n" // r - "sqrdmulh v7.8h, v7.8h, v0.h[3] \n" // a - "uqxtn v4.8b, v4.8h \n" - "uqxtn v5.8b, v5.8h \n" - "uqxtn v6.8b, v6.8h \n" - "uqxtn v7.8b, v7.8h \n" - "st4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // store 8 ARGB - "b.gt 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_argb), // %1 - "+r"(width) // %2 - : "r"(value) // %3 - : "cc", "memory", "v0", "v4", "v5", "v6", "v7"); -} - -// Convert 8 ARGB pixels (64 bytes) to 8 Gray ARGB pixels -// Similar to ARGBToYJ but stores ARGB. -// C code is (29 * b + 150 * g + 77 * r + 128) >> 8; -void ARGBGrayRow_NEON(const uint8_t* src_argb, uint8_t* dst_argb, int width) { - asm volatile( - "movi v24.8b, #29 \n" // B * 0.1140 coefficient - "movi v25.8b, #150 \n" // G * 0.5870 coefficient - "movi v26.8b, #77 \n" // R * 0.2990 coefficient - "1: \n" - "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB - "subs %w2, %w2, #8 \n" // 8 processed per loop. - "umull v4.8h, v0.8b, v24.8b \n" // B - "prfm pldl1keep, [%0, 448] \n" - "umlal v4.8h, v1.8b, v25.8b \n" // G - "umlal v4.8h, v2.8b, v26.8b \n" // R - "uqrshrn v0.8b, v4.8h, #8 \n" // 16 bit to 8 bit B - "orr v1.8b, v0.8b, v0.8b \n" // G - "orr v2.8b, v0.8b, v0.8b \n" // R - "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 pixels. - "b.gt 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_argb), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v24", "v25", "v26"); -} - -// Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels. 
-// b = (r * 35 + g * 68 + b * 17) >> 7 -// g = (r * 45 + g * 88 + b * 22) >> 7 -// r = (r * 50 + g * 98 + b * 24) >> 7 - -void ARGBSepiaRow_NEON(uint8_t* dst_argb, int width) { - asm volatile( - "movi v20.8b, #17 \n" // BB coefficient - "movi v21.8b, #68 \n" // BG coefficient - "movi v22.8b, #35 \n" // BR coefficient - "movi v24.8b, #22 \n" // GB coefficient - "movi v25.8b, #88 \n" // GG coefficient - "movi v26.8b, #45 \n" // GR coefficient - "movi v28.8b, #24 \n" // BB coefficient - "movi v29.8b, #98 \n" // BG coefficient - "movi v30.8b, #50 \n" // BR coefficient - "1: \n" - "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0] \n" // load 8 ARGB pixels. - "subs %w1, %w1, #8 \n" // 8 processed per loop. - "umull v4.8h, v0.8b, v20.8b \n" // B to Sepia B - "prfm pldl1keep, [%0, 448] \n" - "umlal v4.8h, v1.8b, v21.8b \n" // G - "umlal v4.8h, v2.8b, v22.8b \n" // R - "umull v5.8h, v0.8b, v24.8b \n" // B to Sepia G - "umlal v5.8h, v1.8b, v25.8b \n" // G - "umlal v5.8h, v2.8b, v26.8b \n" // R - "umull v6.8h, v0.8b, v28.8b \n" // B to Sepia R - "umlal v6.8h, v1.8b, v29.8b \n" // G - "umlal v6.8h, v2.8b, v30.8b \n" // R - "uqshrn v0.8b, v4.8h, #7 \n" // 16 bit to 8 bit B - "uqshrn v1.8b, v5.8h, #7 \n" // 16 bit to 8 bit G - "uqshrn v2.8b, v6.8h, #7 \n" // 16 bit to 8 bit R - "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // store 8 pixels. - "b.gt 1b \n" - : "+r"(dst_argb), // %0 - "+r"(width) // %1 - : - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", - "v21", "v22", "v24", "v25", "v26", "v28", "v29", "v30"); -} - -// Tranform 8 ARGB pixels (32 bytes) with color matrix. -// TODO(fbarchard): Was same as Sepia except matrix is provided. This function -// needs to saturate. Consider doing a non-saturating version. -void ARGBColorMatrixRow_NEON(const uint8_t* src_argb, - uint8_t* dst_argb, - const int8_t* matrix_argb, - int width) { - asm volatile( - "ld1 {v2.16b}, [%3] \n" // load 3 ARGB vectors. - "sxtl v0.8h, v2.8b \n" // B,G coefficients s16. - "sxtl2 v1.8h, v2.16b \n" // R,A coefficients s16. - - "1: \n" - "ld4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%0], #32 \n" // load 8 ARGB - "subs %w2, %w2, #8 \n" // 8 processed per loop. - "uxtl v16.8h, v16.8b \n" // b (0 .. 
255) 16 bit - "prfm pldl1keep, [%0, 448] \n" - "uxtl v17.8h, v17.8b \n" // g - "uxtl v18.8h, v18.8b \n" // r - "uxtl v19.8h, v19.8b \n" // a - "mul v22.8h, v16.8h, v0.h[0] \n" // B = B * Matrix B - "mul v23.8h, v16.8h, v0.h[4] \n" // G = B * Matrix G - "mul v24.8h, v16.8h, v1.h[0] \n" // R = B * Matrix R - "mul v25.8h, v16.8h, v1.h[4] \n" // A = B * Matrix A - "mul v4.8h, v17.8h, v0.h[1] \n" // B += G * Matrix B - "mul v5.8h, v17.8h, v0.h[5] \n" // G += G * Matrix G - "mul v6.8h, v17.8h, v1.h[1] \n" // R += G * Matrix R - "mul v7.8h, v17.8h, v1.h[5] \n" // A += G * Matrix A - "sqadd v22.8h, v22.8h, v4.8h \n" // Accumulate B - "sqadd v23.8h, v23.8h, v5.8h \n" // Accumulate G - "sqadd v24.8h, v24.8h, v6.8h \n" // Accumulate R - "sqadd v25.8h, v25.8h, v7.8h \n" // Accumulate A - "mul v4.8h, v18.8h, v0.h[2] \n" // B += R * Matrix B - "mul v5.8h, v18.8h, v0.h[6] \n" // G += R * Matrix G - "mul v6.8h, v18.8h, v1.h[2] \n" // R += R * Matrix R - "mul v7.8h, v18.8h, v1.h[6] \n" // A += R * Matrix A - "sqadd v22.8h, v22.8h, v4.8h \n" // Accumulate B - "sqadd v23.8h, v23.8h, v5.8h \n" // Accumulate G - "sqadd v24.8h, v24.8h, v6.8h \n" // Accumulate R - "sqadd v25.8h, v25.8h, v7.8h \n" // Accumulate A - "mul v4.8h, v19.8h, v0.h[3] \n" // B += A * Matrix B - "mul v5.8h, v19.8h, v0.h[7] \n" // G += A * Matrix G - "mul v6.8h, v19.8h, v1.h[3] \n" // R += A * Matrix R - "mul v7.8h, v19.8h, v1.h[7] \n" // A += A * Matrix A - "sqadd v22.8h, v22.8h, v4.8h \n" // Accumulate B - "sqadd v23.8h, v23.8h, v5.8h \n" // Accumulate G - "sqadd v24.8h, v24.8h, v6.8h \n" // Accumulate R - "sqadd v25.8h, v25.8h, v7.8h \n" // Accumulate A - "sqshrun v16.8b, v22.8h, #6 \n" // 16 bit to 8 bit B - "sqshrun v17.8b, v23.8h, #6 \n" // 16 bit to 8 bit G - "sqshrun v18.8b, v24.8h, #6 \n" // 16 bit to 8 bit R - "sqshrun v19.8b, v25.8h, #6 \n" // 16 bit to 8 bit A - "st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%1], #32 \n" // store 8 ARGB - "b.gt 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_argb), // %1 - "+r"(width) // %2 - : "r"(matrix_argb) // %3 - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", - "v17", "v18", "v19", "v22", "v23", "v24", "v25"); -} - -// TODO(fbarchard): fix vqshrun in ARGBMultiplyRow_NEON and reenable. -// Multiply 2 rows of ARGB pixels together, 8 pixels at a time. -void ARGBMultiplyRow_NEON(const uint8_t* src_argb, - const uint8_t* src_argb1, - uint8_t* dst_argb, - int width) { - asm volatile( - // 8 pixel loop. - "1: \n" - "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB - "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 more - "subs %w3, %w3, #8 \n" // 8 processed per loop. - "umull v0.8h, v0.8b, v4.8b \n" // multiply B - "prfm pldl1keep, [%0, 448] \n" - "umull v1.8h, v1.8b, v5.8b \n" // multiply G - "prfm pldl1keep, [%1, 448] \n" - "umull v2.8h, v2.8b, v6.8b \n" // multiply R - "umull v3.8h, v3.8b, v7.8b \n" // multiply A - "rshrn v0.8b, v0.8h, #8 \n" // 16 bit to 8 bit B - "rshrn v1.8b, v1.8h, #8 \n" // 16 bit to 8 bit G - "rshrn v2.8b, v2.8h, #8 \n" // 16 bit to 8 bit R - "rshrn v3.8b, v3.8h, #8 \n" // 16 bit to 8 bit A - "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB - "b.gt 1b \n" - : "+r"(src_argb), // %0 - "+r"(src_argb1), // %1 - "+r"(dst_argb), // %2 - "+r"(width) // %3 - : - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"); -} - -// Add 2 rows of ARGB pixels together, 8 pixels at a time. -void ARGBAddRow_NEON(const uint8_t* src_argb, - const uint8_t* src_argb1, - uint8_t* dst_argb, - int width) { - asm volatile( - // 8 pixel loop. 
- "1: \n" - "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB - "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 more - "subs %w3, %w3, #8 \n" // 8 processed per loop. - "uqadd v0.8b, v0.8b, v4.8b \n" - "prfm pldl1keep, [%0, 448] \n" - "uqadd v1.8b, v1.8b, v5.8b \n" - "prfm pldl1keep, [%1, 448] \n" - "uqadd v2.8b, v2.8b, v6.8b \n" - "uqadd v3.8b, v3.8b, v7.8b \n" - "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB - "b.gt 1b \n" - : "+r"(src_argb), // %0 - "+r"(src_argb1), // %1 - "+r"(dst_argb), // %2 - "+r"(width) // %3 - : - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"); -} - -// Subtract 2 rows of ARGB pixels, 8 pixels at a time. -void ARGBSubtractRow_NEON(const uint8_t* src_argb, - const uint8_t* src_argb1, - uint8_t* dst_argb, - int width) { - asm volatile( - // 8 pixel loop. - "1: \n" - "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB - "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 more - "subs %w3, %w3, #8 \n" // 8 processed per loop. - "uqsub v0.8b, v0.8b, v4.8b \n" - "prfm pldl1keep, [%0, 448] \n" - "uqsub v1.8b, v1.8b, v5.8b \n" - "prfm pldl1keep, [%1, 448] \n" - "uqsub v2.8b, v2.8b, v6.8b \n" - "uqsub v3.8b, v3.8b, v7.8b \n" - "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB - "b.gt 1b \n" - : "+r"(src_argb), // %0 - "+r"(src_argb1), // %1 - "+r"(dst_argb), // %2 - "+r"(width) // %3 - : - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"); -} - -// Adds Sobel X and Sobel Y and stores Sobel into ARGB. -// A = 255 -// R = Sobel -// G = Sobel -// B = Sobel -void SobelRow_NEON(const uint8_t* src_sobelx, - const uint8_t* src_sobely, - uint8_t* dst_argb, - int width) { - asm volatile( - "movi v3.8b, #255 \n" // alpha - // 8 pixel loop. - "1: \n" - "ld1 {v0.8b}, [%0], #8 \n" // load 8 sobelx. - "ld1 {v1.8b}, [%1], #8 \n" // load 8 sobely. - "subs %w3, %w3, #8 \n" // 8 processed per loop. - "uqadd v0.8b, v0.8b, v1.8b \n" // add - "prfm pldl1keep, [%0, 448] \n" - "orr v1.8b, v0.8b, v0.8b \n" - "prfm pldl1keep, [%1, 448] \n" - "orr v2.8b, v0.8b, v0.8b \n" - "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB - "b.gt 1b \n" - : "+r"(src_sobelx), // %0 - "+r"(src_sobely), // %1 - "+r"(dst_argb), // %2 - "+r"(width) // %3 - : - : "cc", "memory", "v0", "v1", "v2", "v3"); -} - -// Adds Sobel X and Sobel Y and stores Sobel into plane. -void SobelToPlaneRow_NEON(const uint8_t* src_sobelx, - const uint8_t* src_sobely, - uint8_t* dst_y, - int width) { - asm volatile( - // 16 pixel loop. - "1: \n" - "ld1 {v0.16b}, [%0], #16 \n" // load 16 sobelx. - "ld1 {v1.16b}, [%1], #16 \n" // load 16 sobely. - "subs %w3, %w3, #16 \n" // 16 processed per loop. - "prfm pldl1keep, [%0, 448] \n" - "uqadd v0.16b, v0.16b, v1.16b \n" // add - "prfm pldl1keep, [%1, 448] \n" - "st1 {v0.16b}, [%2], #16 \n" // store 16 pixels. - "b.gt 1b \n" - : "+r"(src_sobelx), // %0 - "+r"(src_sobely), // %1 - "+r"(dst_y), // %2 - "+r"(width) // %3 - : - : "cc", "memory", "v0", "v1"); -} - -// Mixes Sobel X, Sobel Y and Sobel into ARGB. -// A = 255 -// R = Sobel X -// G = Sobel -// B = Sobel Y -void SobelXYRow_NEON(const uint8_t* src_sobelx, - const uint8_t* src_sobely, - uint8_t* dst_argb, - int width) { - asm volatile( - "movi v3.8b, #255 \n" // alpha - // 8 pixel loop. - "1: \n" - "ld1 {v2.8b}, [%0], #8 \n" // load 8 sobelx. - "ld1 {v0.8b}, [%1], #8 \n" // load 8 sobely. - "subs %w3, %w3, #8 \n" // 8 processed per loop. 
- "prfm pldl1keep, [%0, 448] \n" - "uqadd v1.8b, v0.8b, v2.8b \n" // add - "prfm pldl1keep, [%1, 448] \n" - "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB - "b.gt 1b \n" - : "+r"(src_sobelx), // %0 - "+r"(src_sobely), // %1 - "+r"(dst_argb), // %2 - "+r"(width) // %3 - : - : "cc", "memory", "v0", "v1", "v2", "v3"); -} - -// SobelX as a matrix is -// -1 0 1 -// -2 0 2 -// -1 0 1 -void SobelXRow_NEON(const uint8_t* src_y0, - const uint8_t* src_y1, - const uint8_t* src_y2, - uint8_t* dst_sobelx, - int width) { - asm volatile( - "1: \n" - "ld1 {v0.8b}, [%0],%5 \n" // top - "ld1 {v1.8b}, [%0],%6 \n" - "usubl v0.8h, v0.8b, v1.8b \n" - "prfm pldl1keep, [%0, 448] \n" - "ld1 {v2.8b}, [%1],%5 \n" // center * 2 - "ld1 {v3.8b}, [%1],%6 \n" - "usubl v1.8h, v2.8b, v3.8b \n" - "prfm pldl1keep, [%1, 448] \n" - "add v0.8h, v0.8h, v1.8h \n" - "add v0.8h, v0.8h, v1.8h \n" - "ld1 {v2.8b}, [%2],%5 \n" // bottom - "ld1 {v3.8b}, [%2],%6 \n" - "subs %w4, %w4, #8 \n" // 8 pixels - "prfm pldl1keep, [%2, 448] \n" - "usubl v1.8h, v2.8b, v3.8b \n" - "add v0.8h, v0.8h, v1.8h \n" - "abs v0.8h, v0.8h \n" - "uqxtn v0.8b, v0.8h \n" - "st1 {v0.8b}, [%3], #8 \n" // store 8 sobelx - "b.gt 1b \n" - : "+r"(src_y0), // %0 - "+r"(src_y1), // %1 - "+r"(src_y2), // %2 - "+r"(dst_sobelx), // %3 - "+r"(width) // %4 - : "r"(2LL), // %5 - "r"(6LL) // %6 - : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List - ); -} - -// SobelY as a matrix is -// -1 -2 -1 -// 0 0 0 -// 1 2 1 -void SobelYRow_NEON(const uint8_t* src_y0, - const uint8_t* src_y1, - uint8_t* dst_sobely, - int width) { - asm volatile( - "1: \n" - "ld1 {v0.8b}, [%0],%4 \n" // left - "ld1 {v1.8b}, [%1],%4 \n" - "usubl v0.8h, v0.8b, v1.8b \n" - "ld1 {v2.8b}, [%0],%4 \n" // center * 2 - "ld1 {v3.8b}, [%1],%4 \n" - "usubl v1.8h, v2.8b, v3.8b \n" - "add v0.8h, v0.8h, v1.8h \n" - "add v0.8h, v0.8h, v1.8h \n" - "ld1 {v2.8b}, [%0],%5 \n" // right - "ld1 {v3.8b}, [%1],%5 \n" - "subs %w3, %w3, #8 \n" // 8 pixels - "usubl v1.8h, v2.8b, v3.8b \n" - "prfm pldl1keep, [%0, 448] \n" - "add v0.8h, v0.8h, v1.8h \n" - "prfm pldl1keep, [%1, 448] \n" - "abs v0.8h, v0.8h \n" - "uqxtn v0.8b, v0.8h \n" - "st1 {v0.8b}, [%2], #8 \n" // store 8 sobely - "b.gt 1b \n" - : "+r"(src_y0), // %0 - "+r"(src_y1), // %1 - "+r"(dst_sobely), // %2 - "+r"(width) // %3 - : "r"(1LL), // %4 - "r"(6LL) // %5 - : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List - ); -} - -// Caveat - rounds float to half float whereas scaling version truncates. 
-void HalfFloat1Row_NEON(const uint16_t* src, - uint16_t* dst, - float /*unused*/, - int width) { - asm volatile( - "1: \n" - "ld1 {v1.16b}, [%0], #16 \n" // load 8 shorts - "subs %w2, %w2, #8 \n" // 8 pixels per loop - "uxtl v2.4s, v1.4h \n" // 8 int's - "prfm pldl1keep, [%0, 448] \n" - "uxtl2 v3.4s, v1.8h \n" - "scvtf v2.4s, v2.4s \n" // 8 floats - "scvtf v3.4s, v3.4s \n" - "fcvtn v1.4h, v2.4s \n" // 8 half floats - "fcvtn2 v1.8h, v3.4s \n" - "st1 {v1.16b}, [%1], #16 \n" // store 8 shorts - "b.gt 1b \n" - : "+r"(src), // %0 - "+r"(dst), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "v1", "v2", "v3"); -} - -void HalfFloatRow_NEON(const uint16_t* src, - uint16_t* dst, - float scale, - int width) { - asm volatile( - "1: \n" - "ld1 {v1.16b}, [%0], #16 \n" // load 8 shorts - "subs %w2, %w2, #8 \n" // 8 pixels per loop - "uxtl v2.4s, v1.4h \n" // 8 int's - "prfm pldl1keep, [%0, 448] \n" - "uxtl2 v3.4s, v1.8h \n" - "scvtf v2.4s, v2.4s \n" // 8 floats - "scvtf v3.4s, v3.4s \n" - "fmul v2.4s, v2.4s, %3.s[0] \n" // adjust exponent - "fmul v3.4s, v3.4s, %3.s[0] \n" - "uqshrn v1.4h, v2.4s, #13 \n" // isolate halffloat - "uqshrn2 v1.8h, v3.4s, #13 \n" - "st1 {v1.16b}, [%1], #16 \n" // store 8 shorts - "b.gt 1b \n" - : "+r"(src), // %0 - "+r"(dst), // %1 - "+r"(width) // %2 - : "w"(scale * 1.9259299444e-34f) // %3 - : "cc", "memory", "v1", "v2", "v3"); -} - -void ByteToFloatRow_NEON(const uint8_t* src, - float* dst, - float scale, - int width) { - asm volatile( - "1: \n" - "ld1 {v1.8b}, [%0], #8 \n" // load 8 bytes - "subs %w2, %w2, #8 \n" // 8 pixels per loop - "uxtl v1.8h, v1.8b \n" // 8 shorts - "prfm pldl1keep, [%0, 448] \n" - "uxtl v2.4s, v1.4h \n" // 8 ints - "uxtl2 v3.4s, v1.8h \n" - "scvtf v2.4s, v2.4s \n" // 8 floats - "scvtf v3.4s, v3.4s \n" - "fmul v2.4s, v2.4s, %3.s[0] \n" // scale - "fmul v3.4s, v3.4s, %3.s[0] \n" - "st1 {v2.16b, v3.16b}, [%1], #32 \n" // store 8 floats - "b.gt 1b \n" - : "+r"(src), // %0 - "+r"(dst), // %1 - "+r"(width) // %2 - : "w"(scale) // %3 - : "cc", "memory", "v1", "v2", "v3"); -} - -float ScaleMaxSamples_NEON(const float* src, - float* dst, - float scale, - int width) { - float fmax; - asm volatile( - "movi v5.4s, #0 \n" // max - "movi v6.4s, #0 \n" - - "1: \n" - "ld1 {v1.4s, v2.4s}, [%0], #32 \n" // load 8 samples - "subs %w2, %w2, #8 \n" // 8 processed per loop - "fmul v3.4s, v1.4s, %4.s[0] \n" // scale - "prfm pldl1keep, [%0, 448] \n" - "fmul v4.4s, v2.4s, %4.s[0] \n" // scale - "fmax v5.4s, v5.4s, v1.4s \n" // max - "fmax v6.4s, v6.4s, v2.4s \n" - "st1 {v3.4s, v4.4s}, [%1], #32 \n" // store 8 samples - "b.gt 1b \n" - "fmax v5.4s, v5.4s, v6.4s \n" // max - "fmaxv %s3, v5.4s \n" // signed max acculator - : "+r"(src), // %0 - "+r"(dst), // %1 - "+r"(width), // %2 - "=w"(fmax) // %3 - : "w"(scale) // %4 - : "cc", "memory", "v1", "v2", "v3", "v4", "v5", "v6"); - return fmax; -} - -float ScaleSumSamples_NEON(const float* src, - float* dst, - float scale, - int width) { - float fsum; - asm volatile( - "movi v5.4s, #0 \n" // max - "movi v6.4s, #0 \n" // max - - "1: \n" - "ld1 {v1.4s, v2.4s}, [%0], #32 \n" // load 8 samples - "subs %w2, %w2, #8 \n" // 8 processed per loop - "fmul v3.4s, v1.4s, %4.s[0] \n" // scale - "prfm pldl1keep, [%0, 448] \n" - "fmul v4.4s, v2.4s, %4.s[0] \n" - "fmla v5.4s, v1.4s, v1.4s \n" // sum of squares - "fmla v6.4s, v2.4s, v2.4s \n" - "st1 {v3.4s, v4.4s}, [%1], #32 \n" // store 8 samples - "b.gt 1b \n" - "faddp v5.4s, v5.4s, v6.4s \n" - "faddp v5.4s, v5.4s, v5.4s \n" - "faddp %3.4s, v5.4s, v5.4s \n" // sum - : "+r"(src), // %0 - 
"+r"(dst), // %1 - "+r"(width), // %2 - "=w"(fsum) // %3 - : "w"(scale) // %4 - : "cc", "memory", "v1", "v2", "v3", "v4", "v5", "v6"); - return fsum; -} - -void ScaleSamples_NEON(const float* src, float* dst, float scale, int width) { - asm volatile( - "1: \n" - "ld1 {v1.4s, v2.4s}, [%0], #32 \n" // load 8 samples - "prfm pldl1keep, [%0, 448] \n" - "subs %w2, %w2, #8 \n" // 8 processed per loop - "fmul v1.4s, v1.4s, %3.s[0] \n" // scale - "fmul v2.4s, v2.4s, %3.s[0] \n" // scale - "st1 {v1.4s, v2.4s}, [%1], #32 \n" // store 8 samples - "b.gt 1b \n" - : "+r"(src), // %0 - "+r"(dst), // %1 - "+r"(width) // %2 - : "w"(scale) // %3 - : "cc", "memory", "v1", "v2"); -} - -// filter 5 rows with 1, 4, 6, 4, 1 coefficients to produce 1 row. -void GaussCol_NEON(const uint16_t* src0, - const uint16_t* src1, - const uint16_t* src2, - const uint16_t* src3, - const uint16_t* src4, - uint32_t* dst, - int width) { - asm volatile( - "movi v6.8h, #4 \n" // constant 4 - "movi v7.8h, #6 \n" // constant 6 - - "1: \n" - "ld1 {v1.8h}, [%0], #16 \n" // load 8 samples, 5 rows - "ld1 {v2.8h}, [%4], #16 \n" - "uaddl v0.4s, v1.4h, v2.4h \n" // * 1 - "prfm pldl1keep, [%0, 448] \n" - "uaddl2 v1.4s, v1.8h, v2.8h \n" // * 1 - "ld1 {v2.8h}, [%1], #16 \n" - "umlal v0.4s, v2.4h, v6.4h \n" // * 4 - "prfm pldl1keep, [%1, 448] \n" - "umlal2 v1.4s, v2.8h, v6.8h \n" // * 4 - "ld1 {v2.8h}, [%2], #16 \n" - "umlal v0.4s, v2.4h, v7.4h \n" // * 6 - "prfm pldl1keep, [%2, 448] \n" - "umlal2 v1.4s, v2.8h, v7.8h \n" // * 6 - "ld1 {v2.8h}, [%3], #16 \n" - "umlal v0.4s, v2.4h, v6.4h \n" // * 4 - "prfm pldl1keep, [%3, 448] \n" - "umlal2 v1.4s, v2.8h, v6.8h \n" // * 4 - "subs %w6, %w6, #8 \n" // 8 processed per loop - "st1 {v0.4s,v1.4s}, [%5], #32 \n" // store 8 samples - "prfm pldl1keep, [%4, 448] \n" - "b.gt 1b \n" - : "+r"(src0), // %0 - "+r"(src1), // %1 - "+r"(src2), // %2 - "+r"(src3), // %3 - "+r"(src4), // %4 - "+r"(dst), // %5 - "+r"(width) // %6 - : - : "cc", "memory", "v0", "v1", "v2", "v6", "v7"); -} - -// filter 5 rows with 1, 4, 6, 4, 1 coefficients to produce 1 row. -void GaussRow_NEON(const uint32_t* src, uint16_t* dst, int width) { - const uint32_t* src1 = src + 1; - const uint32_t* src2 = src + 2; - const uint32_t* src3 = src + 3; - asm volatile( - "movi v6.4s, #4 \n" // constant 4 - "movi v7.4s, #6 \n" // constant 6 - - "1: \n" - "ld1 {v0.4s,v1.4s,v2.4s}, [%0], %6 \n" // load 12 source samples - "add v0.4s, v0.4s, v1.4s \n" // * 1 - "add v1.4s, v1.4s, v2.4s \n" // * 1 - "ld1 {v2.4s,v3.4s}, [%2], #32 \n" - "mla v0.4s, v2.4s, v7.4s \n" // * 6 - "mla v1.4s, v3.4s, v7.4s \n" // * 6 - "ld1 {v2.4s,v3.4s}, [%1], #32 \n" - "ld1 {v4.4s,v5.4s}, [%3], #32 \n" - "add v2.4s, v2.4s, v4.4s \n" // add rows for * 4 - "add v3.4s, v3.4s, v5.4s \n" - "prfm pldl1keep, [%0, 448] \n" - "mla v0.4s, v2.4s, v6.4s \n" // * 4 - "mla v1.4s, v3.4s, v6.4s \n" // * 4 - "subs %w5, %w5, #8 \n" // 8 processed per loop - "uqrshrn v0.4h, v0.4s, #8 \n" // round and pack - "uqrshrn2 v0.8h, v1.4s, #8 \n" - "st1 {v0.8h}, [%4], #16 \n" // store 8 samples - "b.gt 1b \n" - : "+r"(src), // %0 - "+r"(src1), // %1 - "+r"(src2), // %2 - "+r"(src3), // %3 - "+r"(dst), // %4 - "+r"(width) // %5 - : "r"(32LL) // %6 - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"); -} - -static const vecf32 kGaussCoefficients = {4.0f, 6.0f, 1.0f / 256.0f, 0.0f}; - -// filter 5 rows with 1, 4, 6, 4, 1 coefficients to produce 1 row. 
-void GaussCol_F32_NEON(const float* src0, - const float* src1, - const float* src2, - const float* src3, - const float* src4, - float* dst, - int width) { - asm volatile( - "ld2r {v6.4s, v7.4s}, [%7] \n" // constants 4 and 6 - - "1: \n" - "ld1 {v0.4s, v1.4s}, [%0], #32 \n" // load 8 samples, 5 rows - "ld1 {v2.4s, v3.4s}, [%1], #32 \n" - "fmla v0.4s, v2.4s, v6.4s \n" // * 4 - "ld1 {v4.4s, v5.4s}, [%2], #32 \n" - "fmla v1.4s, v3.4s, v6.4s \n" - "prfm pldl1keep, [%0, 448] \n" - "fmla v0.4s, v4.4s, v7.4s \n" // * 6 - "ld1 {v2.4s, v3.4s}, [%3], #32 \n" - "fmla v1.4s, v5.4s, v7.4s \n" - "prfm pldl1keep, [%1, 448] \n" - "fmla v0.4s, v2.4s, v6.4s \n" // * 4 - "ld1 {v4.4s, v5.4s}, [%4], #32 \n" - "fmla v1.4s, v3.4s, v6.4s \n" - "prfm pldl1keep, [%2, 448] \n" - "fadd v0.4s, v0.4s, v4.4s \n" // * 1 - "prfm pldl1keep, [%3, 448] \n" - "fadd v1.4s, v1.4s, v5.4s \n" - "prfm pldl1keep, [%4, 448] \n" - "subs %w6, %w6, #8 \n" // 8 processed per loop - "st1 {v0.4s, v1.4s}, [%5], #32 \n" // store 8 samples - "b.gt 1b \n" - : "+r"(src0), // %0 - "+r"(src1), // %1 - "+r"(src2), // %2 - "+r"(src3), // %3 - "+r"(src4), // %4 - "+r"(dst), // %5 - "+r"(width) // %6 - : "r"(&kGaussCoefficients) // %7 - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"); -} - -// filter 5 rows with 1, 4, 6, 4, 1 coefficients to produce 1 row. -void GaussRow_F32_NEON(const float* src, float* dst, int width) { - asm volatile( - "ld3r {v6.4s, v7.4s, v8.4s}, [%3] \n" // constants 4, 6, 1/256 - - "1: \n" - "ld1 {v0.4s, v1.4s, v2.4s}, [%0], %4 \n" // load 12 samples, 5 - // rows - "fadd v0.4s, v0.4s, v1.4s \n" // * 1 - "ld1 {v4.4s, v5.4s}, [%0], %5 \n" - "fadd v1.4s, v1.4s, v2.4s \n" - "fmla v0.4s, v4.4s, v7.4s \n" // * 6 - "ld1 {v2.4s, v3.4s}, [%0], %4 \n" - "fmla v1.4s, v5.4s, v7.4s \n" - "ld1 {v4.4s, v5.4s}, [%0], %6 \n" - "fadd v2.4s, v2.4s, v4.4s \n" - "fadd v3.4s, v3.4s, v5.4s \n" - "fmla v0.4s, v2.4s, v6.4s \n" // * 4 - "fmla v1.4s, v3.4s, v6.4s \n" - "prfm pldl1keep, [%0, 448] \n" - "fmul v0.4s, v0.4s, v8.4s \n" // / 256 - "fmul v1.4s, v1.4s, v8.4s \n" - "subs %w2, %w2, #8 \n" // 8 processed per loop - "st1 {v0.4s, v1.4s}, [%1], #32 \n" // store 8 samples - "b.gt 1b \n" - : "+r"(src), // %0 - "+r"(dst), // %1 - "+r"(width) // %2 - : "r"(&kGaussCoefficients), // %3 - "r"(8LL), // %4 - "r"(-4LL), // %5 - "r"(20LL) // %6 - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8"); -} - -// Convert biplanar NV21 to packed YUV24 -void NV21ToYUV24Row_NEON(const uint8_t* src_y, - const uint8_t* src_vu, - uint8_t* dst_yuv24, - int width) { - asm volatile( - "1: \n" - "ld1 {v2.16b}, [%0], #16 \n" // load 16 Y values - "ld2 {v0.8b, v1.8b}, [%1], #16 \n" // load 8 VU values - "zip1 v0.16b, v0.16b, v0.16b \n" // replicate V values - "prfm pldl1keep, [%0, 448] \n" - "zip1 v1.16b, v1.16b, v1.16b \n" // replicate U values - "prfm pldl1keep, [%1, 448] \n" - "subs %w3, %w3, #16 \n" // 16 pixels per loop - "st3 {v0.16b,v1.16b,v2.16b}, [%2], #48 \n" // store 16 YUV pixels - "b.gt 1b \n" - : "+r"(src_y), // %0 - "+r"(src_vu), // %1 - "+r"(dst_yuv24), // %2 - "+r"(width) // %3 - : - : "cc", "memory", "v0", "v1", "v2"); -} - -// AYUV is YVUA in memory. UV for NV12 is UV order in memory. -void AYUVToUVRow_NEON(const uint8_t* src_ayuv, - int src_stride_ayuv, - uint8_t* dst_uv, - int width) { - const uint8_t* src_ayuv_1 = src_ayuv + src_stride_ayuv; - asm volatile( - - "1: \n" - "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 ayuv - "uaddlp v0.8h, v0.16b \n" // V 16 bytes -> 8 shorts. 
- "prfm pldl1keep, [%0, 448] \n" - "uaddlp v1.8h, v1.16b \n" // U 16 bytes -> 8 shorts. - "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load next 16 - "uadalp v0.8h, v4.16b \n" // V 16 bytes -> 8 shorts. - "prfm pldl1keep, [%1, 448] \n" - "uadalp v1.8h, v5.16b \n" // U 16 bytes -> 8 shorts. - "uqrshrn v3.8b, v0.8h, #2 \n" // 2x2 average - "uqrshrn v2.8b, v1.8h, #2 \n" - "subs %w3, %w3, #16 \n" // 16 processed per loop. - "st2 {v2.8b,v3.8b}, [%2], #16 \n" // store 8 pixels UV. - "b.gt 1b \n" - : "+r"(src_ayuv), // %0 - "+r"(src_ayuv_1), // %1 - "+r"(dst_uv), // %2 - "+r"(width) // %3 - : - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"); -} - -void AYUVToVURow_NEON(const uint8_t* src_ayuv, - int src_stride_ayuv, - uint8_t* dst_vu, - int width) { - const uint8_t* src_ayuv_1 = src_ayuv + src_stride_ayuv; - asm volatile( - - "1: \n" - "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 ayuv - "uaddlp v0.8h, v0.16b \n" // V 16 bytes -> 8 shorts. - "prfm pldl1keep, [%0, 448] \n" - "uaddlp v1.8h, v1.16b \n" // U 16 bytes -> 8 shorts. - "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load next 16 - "uadalp v0.8h, v4.16b \n" // V 16 bytes -> 8 shorts. - "prfm pldl1keep, [%1, 448] \n" - "uadalp v1.8h, v5.16b \n" // U 16 bytes -> 8 shorts. - "uqrshrn v0.8b, v0.8h, #2 \n" // 2x2 average - "uqrshrn v1.8b, v1.8h, #2 \n" - "subs %w3, %w3, #16 \n" // 16 processed per loop. - "st2 {v0.8b,v1.8b}, [%2], #16 \n" // store 8 pixels VU. - "b.gt 1b \n" - : "+r"(src_ayuv), // %0 - "+r"(src_ayuv_1), // %1 - "+r"(dst_vu), // %2 - "+r"(width) // %3 - : - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"); -} - -// Copy row of AYUV Y's into Y -void AYUVToYRow_NEON(const uint8_t* src_ayuv, uint8_t* dst_y, int width) { - asm volatile( - "1: \n" - "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 - "subs %w2, %w2, #16 \n" // 16 pixels per loop - "prfm pldl1keep, [%0, 448] \n" - "st1 {v2.16b}, [%1], #16 \n" // store 16 Y pixels - "b.gt 1b \n" - : "+r"(src_ayuv), // %0 - "+r"(dst_y), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "v0", "v1", "v2", "v3"); -} - -// Shuffle table for swapping UV bytes. -static const uvec8 kShuffleSwapUV = {1u, 0u, 3u, 2u, 5u, 4u, 7u, 6u, - 9u, 8u, 11u, 10u, 13u, 12u, 15u, 14u}; - -// Convert UV plane of NV12 to VU of NV21. 
-void SwapUVRow_NEON(const uint8_t* src_uv, uint8_t* dst_vu, int width) { - asm volatile( - "ld1 {v2.16b}, [%3] \n" // shuffler - "1: \n" - "ld1 {v0.16b}, [%0], 16 \n" // load 16 UV values - "ld1 {v1.16b}, [%0], 16 \n" - "subs %w2, %w2, #16 \n" // 16 pixels per loop - "tbl v0.16b, {v0.16b}, v2.16b \n" - "prfm pldl1keep, [%0, 448] \n" - "tbl v1.16b, {v1.16b}, v2.16b \n" - "stp q0, q1, [%1], 32 \n" // store 16 VU pixels - "b.gt 1b \n" - : "+r"(src_uv), // %0 - "+r"(dst_vu), // %1 - "+r"(width) // %2 - : "r"(&kShuffleSwapUV) // %3 - : "cc", "memory", "v0", "v1", "v2"); -} - -void HalfMergeUVRow_NEON(const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_uv, - int width) { - const uint8_t* src_u_1 = src_u + src_stride_u; - const uint8_t* src_v_1 = src_v + src_stride_v; - asm volatile( - "1: \n" - "ld1 {v0.16b}, [%0], #16 \n" // load 16 U values - "ld1 {v1.16b}, [%2], #16 \n" // load 16 V values - "ld1 {v2.16b}, [%1], #16 \n" - "ld1 {v3.16b}, [%3], #16 \n" - "uaddlp v0.8h, v0.16b \n" // half size - "prfm pldl1keep, [%0, 448] \n" - "uaddlp v1.8h, v1.16b \n" - "prfm pldl1keep, [%2, 448] \n" - "uadalp v0.8h, v2.16b \n" - "prfm pldl1keep, [%1, 448] \n" - "uadalp v1.8h, v3.16b \n" - "prfm pldl1keep, [%3, 448] \n" - "uqrshrn v0.8b, v0.8h, #2 \n" - "uqrshrn v1.8b, v1.8h, #2 \n" - "subs %w5, %w5, #16 \n" // 16 src pixels per loop - "st2 {v0.8b, v1.8b}, [%4], #16 \n" // store 8 UV pixels - "b.gt 1b \n" - : "+r"(src_u), // %0 - "+r"(src_u_1), // %1 - "+r"(src_v), // %2 - "+r"(src_v_1), // %3 - "+r"(dst_uv), // %4 - "+r"(width) // %5 - : - : "cc", "memory", "v0", "v1", "v2", "v3"); -} - -void SplitUVRow_16_NEON(const uint16_t* src_uv, - uint16_t* dst_u, - uint16_t* dst_v, - int depth, - int width) { - int shift = depth - 16; // Negative for right shift. 
- asm volatile( - "dup v2.8h, %w4 \n" - "1: \n" - "ld2 {v0.8h, v1.8h}, [%0], #32 \n" // load 8 UV - "subs %w3, %w3, #8 \n" // 8 src pixels per loop - "ushl v0.8h, v0.8h, v2.8h \n" - "prfm pldl1keep, [%0, 448] \n" - "ushl v1.8h, v1.8h, v2.8h \n" - "st1 {v0.8h}, [%1], #16 \n" // store 8 U pixels - "st1 {v1.8h}, [%2], #16 \n" // store 8 V pixels - "b.gt 1b \n" - : "+r"(src_uv), // %0 - "+r"(dst_u), // %1 - "+r"(dst_v), // %2 - "+r"(width) // %3 - : "r"(shift) // %4 - : "cc", "memory", "v0", "v1", "v2"); -} - -void MergeUVRow_16_NEON(const uint16_t* src_u, - const uint16_t* src_v, - uint16_t* dst_uv, - int depth, - int width) { - int shift = 16 - depth; - asm volatile( - "dup v2.8h, %w4 \n" - "1: \n" - "ld1 {v0.8h}, [%0], #16 \n" // load 8 U - "subs %w3, %w3, #8 \n" // 8 src pixels per loop - "ld1 {v1.8h}, [%1], #16 \n" // load 8 V - "ushl v0.8h, v0.8h, v2.8h \n" - "prfm pldl1keep, [%0, 448] \n" - "ushl v1.8h, v1.8h, v2.8h \n" - "prfm pldl1keep, [%1, 448] \n" - "st2 {v0.8h, v1.8h}, [%2], #32 \n" // store 8 UV pixels - "b.gt 1b \n" - : "+r"(src_u), // %0 - "+r"(src_v), // %1 - "+r"(dst_uv), // %2 - "+r"(width) // %3 - : "r"(shift) // %4 - : "cc", "memory", "v0", "v1", "v2"); -} - -void MultiplyRow_16_NEON(const uint16_t* src_y, - uint16_t* dst_y, - int scale, - int width) { - asm volatile( - "dup v2.8h, %w2 \n" - "1: \n" - "ldp q0, q1, [%0], #32 \n" - "mul v0.8h, v0.8h, v2.8h \n" - "prfm pldl1keep, [%0, 448] \n" - "mul v1.8h, v1.8h, v2.8h \n" - "stp q0, q1, [%1] \n" // store 16 pixels - "add %1, %1, #32 \n" - "subs %w3, %w3, #16 \n" // 16 src pixels per loop - "b.gt 1b \n" - : "+r"(src_y), // %0 - "+r"(dst_y), // %1 - "+r"(scale), // %2 - "+r"(width) // %3 - : - : "cc", "memory", "v0", "v1", "v2"); -} - -void DivideRow_16_NEON(const uint16_t* src_y, - uint16_t* dst_y, - int scale, - int width) { - asm volatile( - "dup v0.8h, %w2 \n" - "1: \n" - "ldp q1, q2, [%0], #32 \n" - "ushll v3.4s, v1.4h, #0 \n" - "ushll v4.4s, v2.4h, #0 \n" - "prfm pldl1keep, [%0, 448] \n" - "ushll2 v1.4s, v1.8h, #0 \n" - "ushll2 v2.4s, v2.8h, #0 \n" - "mul v3.4s, v0.4s, v3.4s \n" - "mul v4.4s, v0.4s, v4.4s \n" - "mul v1.4s, v0.4s, v1.4s \n" - "mul v2.4s, v0.4s, v2.4s \n" - "shrn v3.4h, v3.4s, #16 \n" - "shrn v4.4h, v4.4s, #16 \n" - "shrn2 v3.8h, v1.4s, #16 \n" - "shrn2 v4.8h, v2.4s, #16 \n" - "stp q3, q3, [%1] \n" // store 16 pixels - "add %1, %1, #32 \n" - "subs %w3, %w3, #16 \n" // 16 src pixels per loop - "b.gt 1b \n" - : "+r"(src_y), // %0 - "+r"(dst_y), // %1 - "+r"(scale), // %2 - "+r"(width) // %3 - : - : "cc", "memory", "v0", "v1", "v2", "v3", "v4"); -} - -#endif // !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__) - -#ifdef __cplusplus -} // extern "C" -} // namespace libyuv -#endif diff --git a/thirdparty/libyuv/source/row_win.cc b/thirdparty/libyuv/source/row_win.cc deleted file mode 100644 index 2c3241c..0000000 --- a/thirdparty/libyuv/source/row_win.cc +++ /dev/null @@ -1,6404 +0,0 @@ -/* - * Copyright 2011 The LibYuv Project Authors. All rights reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. 
- */ - -#include "libyuv/row.h" - -// This module is for Visual C 32/64 bit -#if !defined(LIBYUV_DISABLE_X86) && defined(_MSC_VER) && \ - !defined(__clang__) && (defined(_M_IX86) || defined(_M_X64)) - -#if defined(_M_X64) -#include -#include // For _mm_maddubs_epi16 -#endif - -#ifdef __cplusplus -namespace libyuv { -extern "C" { -#endif - -// 64 bit -#if defined(_M_X64) - -// Read 8 UV from 444 -#define READYUV444 \ - xmm3 = _mm_loadl_epi64((__m128i*)u_buf); \ - xmm1 = _mm_loadl_epi64((__m128i*)(u_buf + offset)); \ - xmm3 = _mm_unpacklo_epi8(xmm3, xmm1); \ - u_buf += 8; \ - xmm4 = _mm_loadl_epi64((__m128i*)y_buf); \ - xmm4 = _mm_unpacklo_epi8(xmm4, xmm4); \ - y_buf += 8; - -// Read 8 UV from 444, With 8 Alpha. -#define READYUVA444 \ - xmm3 = _mm_loadl_epi64((__m128i*)u_buf); \ - xmm1 = _mm_loadl_epi64((__m128i*)(u_buf + offset)); \ - xmm3 = _mm_unpacklo_epi8(xmm3, xmm1); \ - u_buf += 8; \ - xmm4 = _mm_loadl_epi64((__m128i*)y_buf); \ - xmm4 = _mm_unpacklo_epi8(xmm4, xmm4); \ - y_buf += 8; \ - xmm5 = _mm_loadl_epi64((__m128i*)a_buf); \ - a_buf += 8; - -// Read 4 UV from 422, upsample to 8 UV. -#define READYUV422 \ - xmm3 = _mm_cvtsi32_si128(*(uint32_t*)u_buf); \ - xmm1 = _mm_cvtsi32_si128(*(uint32_t*)(u_buf + offset)); \ - xmm3 = _mm_unpacklo_epi8(xmm3, xmm1); \ - xmm3 = _mm_unpacklo_epi16(xmm3, xmm3); \ - u_buf += 4; \ - xmm4 = _mm_loadl_epi64((__m128i*)y_buf); \ - xmm4 = _mm_unpacklo_epi8(xmm4, xmm4); \ - y_buf += 8; - -// Read 4 UV from 422, upsample to 8 UV. With 8 Alpha. -#define READYUVA422 \ - xmm3 = _mm_cvtsi32_si128(*(uint32_t*)u_buf); \ - xmm1 = _mm_cvtsi32_si128(*(uint32_t*)(u_buf + offset)); \ - xmm3 = _mm_unpacklo_epi8(xmm3, xmm1); \ - xmm3 = _mm_unpacklo_epi16(xmm3, xmm3); \ - u_buf += 4; \ - xmm4 = _mm_loadl_epi64((__m128i*)y_buf); \ - xmm4 = _mm_unpacklo_epi8(xmm4, xmm4); \ - y_buf += 8; \ - xmm5 = _mm_loadl_epi64((__m128i*)a_buf); \ - a_buf += 8; - -// Convert 8 pixels: 8 UV and 8 Y. -#define YUVTORGB(yuvconstants) \ - xmm3 = _mm_sub_epi8(xmm3, _mm_set1_epi8(0x80)); \ - xmm4 = _mm_mulhi_epu16(xmm4, *(__m128i*)yuvconstants->kYToRgb); \ - xmm4 = _mm_add_epi16(xmm4, *(__m128i*)yuvconstants->kYBiasToRgb); \ - xmm0 = _mm_maddubs_epi16(*(__m128i*)yuvconstants->kUVToB, xmm3); \ - xmm1 = _mm_maddubs_epi16(*(__m128i*)yuvconstants->kUVToG, xmm3); \ - xmm2 = _mm_maddubs_epi16(*(__m128i*)yuvconstants->kUVToR, xmm3); \ - xmm0 = _mm_adds_epi16(xmm4, xmm0); \ - xmm1 = _mm_subs_epi16(xmm4, xmm1); \ - xmm2 = _mm_adds_epi16(xmm4, xmm2); \ - xmm0 = _mm_srai_epi16(xmm0, 6); \ - xmm1 = _mm_srai_epi16(xmm1, 6); \ - xmm2 = _mm_srai_epi16(xmm2, 6); \ - xmm0 = _mm_packus_epi16(xmm0, xmm0); \ - xmm1 = _mm_packus_epi16(xmm1, xmm1); \ - xmm2 = _mm_packus_epi16(xmm2, xmm2); - -// Store 8 ARGB values. 
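For orientation, YUVTORGB above is a fixed-point form of the usual YUV-to-RGB conversion, with the coefficients taken from the caller-supplied yuvconstants tables (kYToRgb, kYBiasToRgb, kUVToB/G/R); STOREARGB below then only interleaves the B, G, R and alpha bytes. This float sketch uses illustrative BT.601 limited-range constants, so treat it as a guide rather than a bit-exact restatement.

#include <stdint.h>

static uint8_t Clamp255f(float v) {
  return v < 0.f ? 0 : v > 255.f ? 255 : (uint8_t)(v + 0.5f);
}

static void YuvPixelToArgb_sketch(uint8_t y, uint8_t u, uint8_t v,
                                  uint8_t* b, uint8_t* g, uint8_t* r) {
  float yf = 1.164f * (y - 16);  // expand studio-swing luma
  *b = Clamp255f(yf + 2.018f * (u - 128));
  *g = Clamp255f(yf - 0.391f * (u - 128) - 0.813f * (v - 128));
  *r = Clamp255f(yf + 1.596f * (v - 128));
}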
-#define STOREARGB \ - xmm0 = _mm_unpacklo_epi8(xmm0, xmm1); \ - xmm2 = _mm_unpacklo_epi8(xmm2, xmm5); \ - xmm1 = _mm_loadu_si128(&xmm0); \ - xmm0 = _mm_unpacklo_epi16(xmm0, xmm2); \ - xmm1 = _mm_unpackhi_epi16(xmm1, xmm2); \ - _mm_storeu_si128((__m128i*)dst_argb, xmm0); \ - _mm_storeu_si128((__m128i*)(dst_argb + 16), xmm1); \ - dst_argb += 32; - -#if defined(HAS_I422TOARGBROW_SSSE3) -void I422ToARGBRow_SSSE3(const uint8_t* y_buf, - const uint8_t* u_buf, - const uint8_t* v_buf, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { - __m128i xmm0, xmm1, xmm2, xmm3, xmm4; - const __m128i xmm5 = _mm_set1_epi8(-1); - const ptrdiff_t offset = (uint8_t*)v_buf - (uint8_t*)u_buf; - while (width > 0) { - READYUV422 - YUVTORGB(yuvconstants) - STOREARGB - width -= 8; - } -} -#endif - -#if defined(HAS_I422ALPHATOARGBROW_SSSE3) -void I422AlphaToARGBRow_SSSE3(const uint8_t* y_buf, - const uint8_t* u_buf, - const uint8_t* v_buf, - const uint8_t* a_buf, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { - __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5; - const ptrdiff_t offset = (uint8_t*)v_buf - (uint8_t*)u_buf; - while (width > 0) { - READYUVA422 - YUVTORGB(yuvconstants) - STOREARGB - width -= 8; - } -} -#endif - -#if defined(HAS_I444TOARGBROW_SSSE3) -void I444ToARGBRow_SSSE3(const uint8_t* y_buf, - const uint8_t* u_buf, - const uint8_t* v_buf, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { - __m128i xmm0, xmm1, xmm2, xmm3, xmm4; - const __m128i xmm5 = _mm_set1_epi8(-1); - const ptrdiff_t offset = (uint8_t*)v_buf - (uint8_t*)u_buf; - while (width > 0) { - READYUV444 - YUVTORGB(yuvconstants) - STOREARGB - width -= 8; - } -} -#endif - -#if defined(HAS_I444ALPHATOARGBROW_SSSE3) -void I444AlphaToARGBRow_SSSE3(const uint8_t* y_buf, - const uint8_t* u_buf, - const uint8_t* v_buf, - const uint8_t* a_buf, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { - __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5; - const ptrdiff_t offset = (uint8_t*)v_buf - (uint8_t*)u_buf; - while (width > 0) { - READYUVA444 - YUVTORGB(yuvconstants) - STOREARGB - width -= 8; - } -} -#endif - -// 32 bit -#else // defined(_M_X64) -#ifdef HAS_ARGBTOYROW_SSSE3 - -// Constants for ARGB. -static const vec8 kARGBToY = {13, 65, 33, 0, 13, 65, 33, 0, - 13, 65, 33, 0, 13, 65, 33, 0}; - -// JPeg full range. -static const vec8 kARGBToYJ = {15, 75, 38, 0, 15, 75, 38, 0, - 15, 75, 38, 0, 15, 75, 38, 0}; - -static const vec8 kARGBToU = {112, -74, -38, 0, 112, -74, -38, 0, - 112, -74, -38, 0, 112, -74, -38, 0}; - -static const vec8 kARGBToUJ = {127, -84, -43, 0, 127, -84, -43, 0, - 127, -84, -43, 0, 127, -84, -43, 0}; - -static const vec8 kARGBToV = { - -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -}; - -static const vec8 kARGBToVJ = {-20, -107, 127, 0, -20, -107, 127, 0, - -20, -107, 127, 0, -20, -107, 127, 0}; - -// vpshufb for vphaddw + vpackuswb packed to shorts. -static const lvec8 kShufARGBToUV_AVX = { - 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15, - 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15}; - -// Constants for BGRA. -static const vec8 kBGRAToY = {0, 33, 65, 13, 0, 33, 65, 13, - 0, 33, 65, 13, 0, 33, 65, 13}; - -static const vec8 kBGRAToU = {0, -38, -74, 112, 0, -38, -74, 112, - 0, -38, -74, 112, 0, -38, -74, 112}; - -static const vec8 kBGRAToV = {0, 112, -94, -18, 0, 112, -94, -18, - 0, 112, -94, -18, 0, 112, -94, -18}; - -// Constants for ABGR. 
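The kARGBToY/U/V tables above encode the usual BT.601 studio-range weights, and the BGRA/ABGR/RGBA tables that follow are the same weights permuted for each byte order. A scalar sketch of the intended formulas (rounding details differ slightly between the C, SSE and NEON paths, so this is indicative rather than bit-exact; function names are illustrative):

#include <stdint.h>

static uint8_t RGBToY_sketch(uint8_t r, uint8_t g, uint8_t b) {
  return (uint8_t)((66 * r + 129 * g + 25 * b + 0x1080) >> 8);  // 16..235
}

static uint8_t RGBToU_sketch(uint8_t r, uint8_t g, uint8_t b) {
  return (uint8_t)((112 * b - 74 * g - 38 * r + 0x8080) >> 8);  // biased by 128
}

static uint8_t RGBToV_sketch(uint8_t r, uint8_t g, uint8_t b) {
  return (uint8_t)((112 * r - 94 * g - 18 * b + 0x8080) >> 8);  // biased by 128
}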
-static const vec8 kABGRToY = {33, 65, 13, 0, 33, 65, 13, 0, - 33, 65, 13, 0, 33, 65, 13, 0}; - -static const vec8 kABGRToU = {-38, -74, 112, 0, -38, -74, 112, 0, - -38, -74, 112, 0, -38, -74, 112, 0}; - -static const vec8 kABGRToV = {112, -94, -18, 0, 112, -94, -18, 0, - 112, -94, -18, 0, 112, -94, -18, 0}; - -// Constants for RGBA. -static const vec8 kRGBAToY = {0, 13, 65, 33, 0, 13, 65, 33, - 0, 13, 65, 33, 0, 13, 65, 33}; - -static const vec8 kRGBAToU = {0, 112, -74, -38, 0, 112, -74, -38, - 0, 112, -74, -38, 0, 112, -74, -38}; - -static const vec8 kRGBAToV = {0, -18, -94, 112, 0, -18, -94, 112, - 0, -18, -94, 112, 0, -18, -94, 112}; - -static const uvec8 kAddY16 = {16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, - 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u}; - -// 7 bit fixed point 0.5. -static const vec16 kAddYJ64 = {64, 64, 64, 64, 64, 64, 64, 64}; - -// 8 bit fixed point 0.5, for bias of UV. -static const ulvec8 kBiasUV128 = { - 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, - 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, - 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80}; - -// Shuffle table for converting RGB24 to ARGB. -static const uvec8 kShuffleMaskRGB24ToARGB = { - 0u, 1u, 2u, 12u, 3u, 4u, 5u, 13u, 6u, 7u, 8u, 14u, 9u, 10u, 11u, 15u}; - -// Shuffle table for converting RAW to ARGB. -static const uvec8 kShuffleMaskRAWToARGB = {2u, 1u, 0u, 12u, 5u, 4u, 3u, 13u, - 8u, 7u, 6u, 14u, 11u, 10u, 9u, 15u}; - -// Shuffle table for converting RAW to RGB24. First 8. -static const uvec8 kShuffleMaskRAWToRGB24_0 = { - 2u, 1u, 0u, 5u, 4u, 3u, 8u, 7u, - 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u}; - -// Shuffle table for converting RAW to RGB24. Middle 8. -static const uvec8 kShuffleMaskRAWToRGB24_1 = { - 2u, 7u, 6u, 5u, 10u, 9u, 8u, 13u, - 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u}; - -// Shuffle table for converting RAW to RGB24. Last 8. -static const uvec8 kShuffleMaskRAWToRGB24_2 = { - 8u, 7u, 12u, 11u, 10u, 15u, 14u, 13u, - 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u}; - -// Shuffle table for converting ARGB to RGB24. -static const uvec8 kShuffleMaskARGBToRGB24 = { - 0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 10u, 12u, 13u, 14u, 128u, 128u, 128u, 128u}; - -// Shuffle table for converting ARGB to RAW. -static const uvec8 kShuffleMaskARGBToRAW = { - 2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 8u, 14u, 13u, 12u, 128u, 128u, 128u, 128u}; - -// Shuffle table for converting ARGBToRGB24 for I422ToRGB24. First 8 + next 4 -static const uvec8 kShuffleMaskARGBToRGB24_0 = { - 0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 128u, 128u, 128u, 128u, 10u, 12u, 13u, 14u}; - -// YUY2 shuf 16 Y to 32 Y. -static const lvec8 kShuffleYUY2Y = {0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, - 10, 12, 12, 14, 14, 0, 0, 2, 2, 4, 4, - 6, 6, 8, 8, 10, 10, 12, 12, 14, 14}; - -// YUY2 shuf 8 UV to 16 UV. -static const lvec8 kShuffleYUY2UV = {1, 3, 1, 3, 5, 7, 5, 7, 9, 11, 9, - 11, 13, 15, 13, 15, 1, 3, 1, 3, 5, 7, - 5, 7, 9, 11, 9, 11, 13, 15, 13, 15}; - -// UYVY shuf 16 Y to 32 Y. -static const lvec8 kShuffleUYVYY = {1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, - 11, 13, 13, 15, 15, 1, 1, 3, 3, 5, 5, - 7, 7, 9, 9, 11, 11, 13, 13, 15, 15}; - -// UYVY shuf 8 UV to 16 UV. -static const lvec8 kShuffleUYVYUV = {0, 2, 0, 2, 4, 6, 4, 6, 8, 10, 8, - 10, 12, 14, 12, 14, 0, 2, 0, 2, 4, 6, - 4, 6, 8, 10, 8, 10, 12, 14, 12, 14}; - -// NV21 shuf 8 VU to 16 UV. 
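The kShuffleMask* and kShuffle* tables above are byte selectors for pshufb (and its per-128-bit-lane AVX2 form); the kShuffleNV21 table below follows the same convention. A scalar emulation of one 16-byte lane shows how to read them: each output byte copies src[mask & 15], or is zeroed when the high bit of the mask byte is set (the 128u entries).

#include <stdint.h>

static void Shuffle16_sketch(const uint8_t* src, const uint8_t* mask,
                             uint8_t* dst) {
  for (int i = 0; i < 16; ++i) {
    dst[i] = (mask[i] & 0x80) ? 0 : src[mask[i] & 15];
  }
}

// For example, applying kShuffleMaskARGBToRGB24 to 4 ARGB pixels packs the
// first 12 output bytes as B,G,R triplets and zeroes the remaining 4 bytes.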
-static const lvec8 kShuffleNV21 = { - 1, 0, 1, 0, 3, 2, 3, 2, 5, 4, 5, 4, 7, 6, 7, 6, - 1, 0, 1, 0, 3, 2, 3, 2, 5, 4, 5, 4, 7, 6, 7, 6, -}; - -// Duplicates gray value 3 times and fills in alpha opaque. -__declspec(naked) void J400ToARGBRow_SSE2(const uint8_t* src_y, - uint8_t* dst_argb, - int width) { - __asm { - mov eax, [esp + 4] // src_y - mov edx, [esp + 8] // dst_argb - mov ecx, [esp + 12] // width - pcmpeqb xmm5, xmm5 // generate mask 0xff000000 - pslld xmm5, 24 - - convertloop: - movq xmm0, qword ptr [eax] - lea eax, [eax + 8] - punpcklbw xmm0, xmm0 - movdqa xmm1, xmm0 - punpcklwd xmm0, xmm0 - punpckhwd xmm1, xmm1 - por xmm0, xmm5 - por xmm1, xmm5 - movdqu [edx], xmm0 - movdqu [edx + 16], xmm1 - lea edx, [edx + 32] - sub ecx, 8 - jg convertloop - ret - } -} - -#ifdef HAS_J400TOARGBROW_AVX2 -// Duplicates gray value 3 times and fills in alpha opaque. -__declspec(naked) void J400ToARGBRow_AVX2(const uint8_t* src_y, - uint8_t* dst_argb, - int width) { - __asm { - mov eax, [esp + 4] // src_y - mov edx, [esp + 8] // dst_argb - mov ecx, [esp + 12] // width - vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0xff000000 - vpslld ymm5, ymm5, 24 - - convertloop: - vmovdqu xmm0, [eax] - lea eax, [eax + 16] - vpermq ymm0, ymm0, 0xd8 - vpunpcklbw ymm0, ymm0, ymm0 - vpermq ymm0, ymm0, 0xd8 - vpunpckhwd ymm1, ymm0, ymm0 - vpunpcklwd ymm0, ymm0, ymm0 - vpor ymm0, ymm0, ymm5 - vpor ymm1, ymm1, ymm5 - vmovdqu [edx], ymm0 - vmovdqu [edx + 32], ymm1 - lea edx, [edx + 64] - sub ecx, 16 - jg convertloop - vzeroupper - ret - } -} -#endif // HAS_J400TOARGBROW_AVX2 - -__declspec(naked) void RGB24ToARGBRow_SSSE3(const uint8_t* src_rgb24, - uint8_t* dst_argb, - int width) { - __asm { - mov eax, [esp + 4] // src_rgb24 - mov edx, [esp + 8] // dst_argb - mov ecx, [esp + 12] // width - pcmpeqb xmm5, xmm5 // generate mask 0xff000000 - pslld xmm5, 24 - movdqa xmm4, xmmword ptr kShuffleMaskRGB24ToARGB - - convertloop: - movdqu xmm0, [eax] - movdqu xmm1, [eax + 16] - movdqu xmm3, [eax + 32] - lea eax, [eax + 48] - movdqa xmm2, xmm3 - palignr xmm2, xmm1, 8 // xmm2 = { xmm3[0:3] xmm1[8:15]} - pshufb xmm2, xmm4 - por xmm2, xmm5 - palignr xmm1, xmm0, 12 // xmm1 = { xmm3[0:7] xmm0[12:15]} - pshufb xmm0, xmm4 - movdqu [edx + 32], xmm2 - por xmm0, xmm5 - pshufb xmm1, xmm4 - movdqu [edx], xmm0 - por xmm1, xmm5 - palignr xmm3, xmm3, 4 // xmm3 = { xmm3[4:15]} - pshufb xmm3, xmm4 - movdqu [edx + 16], xmm1 - por xmm3, xmm5 - movdqu [edx + 48], xmm3 - lea edx, [edx + 64] - sub ecx, 16 - jg convertloop - ret - } -} - -__declspec(naked) void RAWToARGBRow_SSSE3(const uint8_t* src_raw, - uint8_t* dst_argb, - int width) { - __asm { - mov eax, [esp + 4] // src_raw - mov edx, [esp + 8] // dst_argb - mov ecx, [esp + 12] // width - pcmpeqb xmm5, xmm5 // generate mask 0xff000000 - pslld xmm5, 24 - movdqa xmm4, xmmword ptr kShuffleMaskRAWToARGB - - convertloop: - movdqu xmm0, [eax] - movdqu xmm1, [eax + 16] - movdqu xmm3, [eax + 32] - lea eax, [eax + 48] - movdqa xmm2, xmm3 - palignr xmm2, xmm1, 8 // xmm2 = { xmm3[0:3] xmm1[8:15]} - pshufb xmm2, xmm4 - por xmm2, xmm5 - palignr xmm1, xmm0, 12 // xmm1 = { xmm3[0:7] xmm0[12:15]} - pshufb xmm0, xmm4 - movdqu [edx + 32], xmm2 - por xmm0, xmm5 - pshufb xmm1, xmm4 - movdqu [edx], xmm0 - por xmm1, xmm5 - palignr xmm3, xmm3, 4 // xmm3 = { xmm3[4:15]} - pshufb xmm3, xmm4 - movdqu [edx + 16], xmm1 - por xmm3, xmm5 - movdqu [edx + 48], xmm3 - lea edx, [edx + 64] - sub ecx, 16 - jg convertloop - ret - } -} - -__declspec(naked) void RAWToRGB24Row_SSSE3(const uint8_t* src_raw, - uint8_t* dst_rgb24, - int 
width) { - __asm { - mov eax, [esp + 4] // src_raw - mov edx, [esp + 8] // dst_rgb24 - mov ecx, [esp + 12] // width - movdqa xmm3, xmmword ptr kShuffleMaskRAWToRGB24_0 - movdqa xmm4, xmmword ptr kShuffleMaskRAWToRGB24_1 - movdqa xmm5, xmmword ptr kShuffleMaskRAWToRGB24_2 - - convertloop: - movdqu xmm0, [eax] - movdqu xmm1, [eax + 4] - movdqu xmm2, [eax + 8] - lea eax, [eax + 24] - pshufb xmm0, xmm3 - pshufb xmm1, xmm4 - pshufb xmm2, xmm5 - movq qword ptr [edx], xmm0 - movq qword ptr [edx + 8], xmm1 - movq qword ptr [edx + 16], xmm2 - lea edx, [edx + 24] - sub ecx, 8 - jg convertloop - ret - } -} - -// pmul method to replicate bits. -// Math to replicate bits: -// (v << 8) | (v << 3) -// v * 256 + v * 8 -// v * (256 + 8) -// G shift of 5 is incorporated, so shift is 5 + 8 and 5 + 3 -// 20 instructions. -__declspec(naked) void RGB565ToARGBRow_SSE2(const uint8_t* src_rgb565, - uint8_t* dst_argb, - int width) { - __asm { - mov eax, 0x01080108 // generate multiplier to repeat 5 bits - movd xmm5, eax - pshufd xmm5, xmm5, 0 - mov eax, 0x20802080 // multiplier shift by 5 and then repeat 6 bits - movd xmm6, eax - pshufd xmm6, xmm6, 0 - pcmpeqb xmm3, xmm3 // generate mask 0xf800f800 for Red - psllw xmm3, 11 - pcmpeqb xmm4, xmm4 // generate mask 0x07e007e0 for Green - psllw xmm4, 10 - psrlw xmm4, 5 - pcmpeqb xmm7, xmm7 // generate mask 0xff00ff00 for Alpha - psllw xmm7, 8 - - mov eax, [esp + 4] // src_rgb565 - mov edx, [esp + 8] // dst_argb - mov ecx, [esp + 12] // width - sub edx, eax - sub edx, eax - - convertloop: - movdqu xmm0, [eax] // fetch 8 pixels of bgr565 - movdqa xmm1, xmm0 - movdqa xmm2, xmm0 - pand xmm1, xmm3 // R in upper 5 bits - psllw xmm2, 11 // B in upper 5 bits - pmulhuw xmm1, xmm5 // * (256 + 8) - pmulhuw xmm2, xmm5 // * (256 + 8) - psllw xmm1, 8 - por xmm1, xmm2 // RB - pand xmm0, xmm4 // G in middle 6 bits - pmulhuw xmm0, xmm6 // << 5 * (256 + 4) - por xmm0, xmm7 // AG - movdqa xmm2, xmm1 - punpcklbw xmm1, xmm0 - punpckhbw xmm2, xmm0 - movdqu [eax * 2 + edx], xmm1 // store 4 pixels of ARGB - movdqu [eax * 2 + edx + 16], xmm2 // store next 4 pixels of ARGB - lea eax, [eax + 16] - sub ecx, 8 - jg convertloop - ret - } -} - -#ifdef HAS_RGB565TOARGBROW_AVX2 -// pmul method to replicate bits. 
-// Math to replicate bits: -// (v << 8) | (v << 3) -// v * 256 + v * 8 -// v * (256 + 8) -// G shift of 5 is incorporated, so shift is 5 + 8 and 5 + 3 -__declspec(naked) void RGB565ToARGBRow_AVX2(const uint8_t* src_rgb565, - uint8_t* dst_argb, - int width) { - __asm { - mov eax, 0x01080108 // generate multiplier to repeat 5 bits - vmovd xmm5, eax - vbroadcastss ymm5, xmm5 - mov eax, 0x20802080 // multiplier shift by 5 and then repeat 6 bits - vmovd xmm6, eax - vbroadcastss ymm6, xmm6 - vpcmpeqb ymm3, ymm3, ymm3 // generate mask 0xf800f800 for Red - vpsllw ymm3, ymm3, 11 - vpcmpeqb ymm4, ymm4, ymm4 // generate mask 0x07e007e0 for Green - vpsllw ymm4, ymm4, 10 - vpsrlw ymm4, ymm4, 5 - vpcmpeqb ymm7, ymm7, ymm7 // generate mask 0xff00ff00 for Alpha - vpsllw ymm7, ymm7, 8 - - mov eax, [esp + 4] // src_rgb565 - mov edx, [esp + 8] // dst_argb - mov ecx, [esp + 12] // width - sub edx, eax - sub edx, eax - - convertloop: - vmovdqu ymm0, [eax] // fetch 16 pixels of bgr565 - vpand ymm1, ymm0, ymm3 // R in upper 5 bits - vpsllw ymm2, ymm0, 11 // B in upper 5 bits - vpmulhuw ymm1, ymm1, ymm5 // * (256 + 8) - vpmulhuw ymm2, ymm2, ymm5 // * (256 + 8) - vpsllw ymm1, ymm1, 8 - vpor ymm1, ymm1, ymm2 // RB - vpand ymm0, ymm0, ymm4 // G in middle 6 bits - vpmulhuw ymm0, ymm0, ymm6 // << 5 * (256 + 4) - vpor ymm0, ymm0, ymm7 // AG - vpermq ymm0, ymm0, 0xd8 // mutate for unpack - vpermq ymm1, ymm1, 0xd8 - vpunpckhbw ymm2, ymm1, ymm0 - vpunpcklbw ymm1, ymm1, ymm0 - vmovdqu [eax * 2 + edx], ymm1 // store 4 pixels of ARGB - vmovdqu [eax * 2 + edx + 32], ymm2 // store next 4 pixels of ARGB - lea eax, [eax + 32] - sub ecx, 16 - jg convertloop - vzeroupper - ret - } -} -#endif // HAS_RGB565TOARGBROW_AVX2 - -#ifdef HAS_ARGB1555TOARGBROW_AVX2 -__declspec(naked) void ARGB1555ToARGBRow_AVX2(const uint8_t* src_argb1555, - uint8_t* dst_argb, - int width) { - __asm { - mov eax, 0x01080108 // generate multiplier to repeat 5 bits - vmovd xmm5, eax - vbroadcastss ymm5, xmm5 - mov eax, 0x42004200 // multiplier shift by 6 and then repeat 5 bits - vmovd xmm6, eax - vbroadcastss ymm6, xmm6 - vpcmpeqb ymm3, ymm3, ymm3 // generate mask 0xf800f800 for Red - vpsllw ymm3, ymm3, 11 - vpsrlw ymm4, ymm3, 6 // generate mask 0x03e003e0 for Green - vpcmpeqb ymm7, ymm7, ymm7 // generate mask 0xff00ff00 for Alpha - vpsllw ymm7, ymm7, 8 - - mov eax, [esp + 4] // src_argb1555 - mov edx, [esp + 8] // dst_argb - mov ecx, [esp + 12] // width - sub edx, eax - sub edx, eax - - convertloop: - vmovdqu ymm0, [eax] // fetch 16 pixels of 1555 - vpsllw ymm1, ymm0, 1 // R in upper 5 bits - vpsllw ymm2, ymm0, 11 // B in upper 5 bits - vpand ymm1, ymm1, ymm3 - vpmulhuw ymm2, ymm2, ymm5 // * (256 + 8) - vpmulhuw ymm1, ymm1, ymm5 // * (256 + 8) - vpsllw ymm1, ymm1, 8 - vpor ymm1, ymm1, ymm2 // RB - vpsraw ymm2, ymm0, 8 // A - vpand ymm0, ymm0, ymm4 // G in middle 5 bits - vpmulhuw ymm0, ymm0, ymm6 // << 6 * (256 + 8) - vpand ymm2, ymm2, ymm7 - vpor ymm0, ymm0, ymm2 // AG - vpermq ymm0, ymm0, 0xd8 // mutate for unpack - vpermq ymm1, ymm1, 0xd8 - vpunpckhbw ymm2, ymm1, ymm0 - vpunpcklbw ymm1, ymm1, ymm0 - vmovdqu [eax * 2 + edx], ymm1 // store 8 pixels of ARGB - vmovdqu [eax * 2 + edx + 32], ymm2 // store next 8 pixels of ARGB - lea eax, [eax + 32] - sub ecx, 16 - jg convertloop - vzeroupper - ret - } -} -#endif // HAS_ARGB1555TOARGBROW_AVX2 - -#ifdef HAS_ARGB4444TOARGBROW_AVX2 -__declspec(naked) void ARGB4444ToARGBRow_AVX2(const uint8_t* src_argb4444, - uint8_t* dst_argb, - int width) { - __asm { - mov eax, 0x0f0f0f0f // generate mask 0x0f0f0f0f - vmovd xmm4, 
eax - vbroadcastss ymm4, xmm4 - vpslld ymm5, ymm4, 4 // 0xf0f0f0f0 for high nibbles - mov eax, [esp + 4] // src_argb4444 - mov edx, [esp + 8] // dst_argb - mov ecx, [esp + 12] // width - sub edx, eax - sub edx, eax - - convertloop: - vmovdqu ymm0, [eax] // fetch 16 pixels of bgra4444 - vpand ymm2, ymm0, ymm5 // mask high nibbles - vpand ymm0, ymm0, ymm4 // mask low nibbles - vpsrlw ymm3, ymm2, 4 - vpsllw ymm1, ymm0, 4 - vpor ymm2, ymm2, ymm3 - vpor ymm0, ymm0, ymm1 - vpermq ymm0, ymm0, 0xd8 // mutate for unpack - vpermq ymm2, ymm2, 0xd8 - vpunpckhbw ymm1, ymm0, ymm2 - vpunpcklbw ymm0, ymm0, ymm2 - vmovdqu [eax * 2 + edx], ymm0 // store 8 pixels of ARGB - vmovdqu [eax * 2 + edx + 32], ymm1 // store next 8 pixels of ARGB - lea eax, [eax + 32] - sub ecx, 16 - jg convertloop - vzeroupper - ret - } -} -#endif // HAS_ARGB4444TOARGBROW_AVX2 - -// 24 instructions -__declspec(naked) void ARGB1555ToARGBRow_SSE2(const uint8_t* src_argb1555, - uint8_t* dst_argb, - int width) { - __asm { - mov eax, 0x01080108 // generate multiplier to repeat 5 bits - movd xmm5, eax - pshufd xmm5, xmm5, 0 - mov eax, 0x42004200 // multiplier shift by 6 and then repeat 5 bits - movd xmm6, eax - pshufd xmm6, xmm6, 0 - pcmpeqb xmm3, xmm3 // generate mask 0xf800f800 for Red - psllw xmm3, 11 - movdqa xmm4, xmm3 // generate mask 0x03e003e0 for Green - psrlw xmm4, 6 - pcmpeqb xmm7, xmm7 // generate mask 0xff00ff00 for Alpha - psllw xmm7, 8 - - mov eax, [esp + 4] // src_argb1555 - mov edx, [esp + 8] // dst_argb - mov ecx, [esp + 12] // width - sub edx, eax - sub edx, eax - - convertloop: - movdqu xmm0, [eax] // fetch 8 pixels of 1555 - movdqa xmm1, xmm0 - movdqa xmm2, xmm0 - psllw xmm1, 1 // R in upper 5 bits - psllw xmm2, 11 // B in upper 5 bits - pand xmm1, xmm3 - pmulhuw xmm2, xmm5 // * (256 + 8) - pmulhuw xmm1, xmm5 // * (256 + 8) - psllw xmm1, 8 - por xmm1, xmm2 // RB - movdqa xmm2, xmm0 - pand xmm0, xmm4 // G in middle 5 bits - psraw xmm2, 8 // A - pmulhuw xmm0, xmm6 // << 6 * (256 + 8) - pand xmm2, xmm7 - por xmm0, xmm2 // AG - movdqa xmm2, xmm1 - punpcklbw xmm1, xmm0 - punpckhbw xmm2, xmm0 - movdqu [eax * 2 + edx], xmm1 // store 4 pixels of ARGB - movdqu [eax * 2 + edx + 16], xmm2 // store next 4 pixels of ARGB - lea eax, [eax + 16] - sub ecx, 8 - jg convertloop - ret - } -} - -// 18 instructions. 
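For reference, the per-pixel arithmetic the ARGB1555 unpackers above implement, written out in scalar C: the single alpha bit becomes 0x00 or 0xff (the kernels obtain this with an arithmetic right shift that smears the sign bit), and each 5-bit channel is widened by bit replication. The helper name is hypothetical and only mirrors the kernels' math:

#include <stdint.h>

/* Hypothetical scalar mirror of ARGB1555ToARGBRow; not a libyuv entry point. */
static void Argb1555ToArgbRow_sketch(const uint8_t* src_argb1555,
                                     uint8_t* dst_argb, int width) {
  for (int x = 0; x < width; ++x) {
    uint16_t p = (uint16_t)(src_argb1555[0] | (src_argb1555[1] << 8));
    uint8_t b = (uint8_t)(p & 0x1f);
    uint8_t g = (uint8_t)((p >> 5) & 0x1f);
    uint8_t r = (uint8_t)((p >> 10) & 0x1f);
    dst_argb[0] = (uint8_t)((b << 3) | (b >> 2));  /* replicate 5 -> 8 bits */
    dst_argb[1] = (uint8_t)((g << 3) | (g >> 2));
    dst_argb[2] = (uint8_t)((r << 3) | (r >> 2));
    dst_argb[3] = (p & 0x8000) ? 0xff : 0x00;      /* 1-bit alpha */
    src_argb1555 += 2;
    dst_argb += 4;
  }
}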
-__declspec(naked) void ARGB4444ToARGBRow_SSE2(const uint8_t* src_argb4444, - uint8_t* dst_argb, - int width) { - __asm { - mov eax, 0x0f0f0f0f // generate mask 0x0f0f0f0f - movd xmm4, eax - pshufd xmm4, xmm4, 0 - movdqa xmm5, xmm4 // 0xf0f0f0f0 for high nibbles - pslld xmm5, 4 - mov eax, [esp + 4] // src_argb4444 - mov edx, [esp + 8] // dst_argb - mov ecx, [esp + 12] // width - sub edx, eax - sub edx, eax - - convertloop: - movdqu xmm0, [eax] // fetch 8 pixels of bgra4444 - movdqa xmm2, xmm0 - pand xmm0, xmm4 // mask low nibbles - pand xmm2, xmm5 // mask high nibbles - movdqa xmm1, xmm0 - movdqa xmm3, xmm2 - psllw xmm1, 4 - psrlw xmm3, 4 - por xmm0, xmm1 - por xmm2, xmm3 - movdqa xmm1, xmm0 - punpcklbw xmm0, xmm2 - punpckhbw xmm1, xmm2 - movdqu [eax * 2 + edx], xmm0 // store 4 pixels of ARGB - movdqu [eax * 2 + edx + 16], xmm1 // store next 4 pixels of ARGB - lea eax, [eax + 16] - sub ecx, 8 - jg convertloop - ret - } -} - -__declspec(naked) void ARGBToRGB24Row_SSSE3(const uint8_t* src_argb, - uint8_t* dst_rgb, - int width) { - __asm { - mov eax, [esp + 4] // src_argb - mov edx, [esp + 8] // dst_rgb - mov ecx, [esp + 12] // width - movdqa xmm6, xmmword ptr kShuffleMaskARGBToRGB24 - - convertloop: - movdqu xmm0, [eax] // fetch 16 pixels of argb - movdqu xmm1, [eax + 16] - movdqu xmm2, [eax + 32] - movdqu xmm3, [eax + 48] - lea eax, [eax + 64] - pshufb xmm0, xmm6 // pack 16 bytes of ARGB to 12 bytes of RGB - pshufb xmm1, xmm6 - pshufb xmm2, xmm6 - pshufb xmm3, xmm6 - movdqa xmm4, xmm1 // 4 bytes from 1 for 0 - psrldq xmm1, 4 // 8 bytes from 1 - pslldq xmm4, 12 // 4 bytes from 1 for 0 - movdqa xmm5, xmm2 // 8 bytes from 2 for 1 - por xmm0, xmm4 // 4 bytes from 1 for 0 - pslldq xmm5, 8 // 8 bytes from 2 for 1 - movdqu [edx], xmm0 // store 0 - por xmm1, xmm5 // 8 bytes from 2 for 1 - psrldq xmm2, 8 // 4 bytes from 2 - pslldq xmm3, 4 // 12 bytes from 3 for 2 - por xmm2, xmm3 // 12 bytes from 3 for 2 - movdqu [edx + 16], xmm1 // store 1 - movdqu [edx + 32], xmm2 // store 2 - lea edx, [edx + 48] - sub ecx, 16 - jg convertloop - ret - } -} - -__declspec(naked) void ARGBToRAWRow_SSSE3(const uint8_t* src_argb, - uint8_t* dst_rgb, - int width) { - __asm { - mov eax, [esp + 4] // src_argb - mov edx, [esp + 8] // dst_rgb - mov ecx, [esp + 12] // width - movdqa xmm6, xmmword ptr kShuffleMaskARGBToRAW - - convertloop: - movdqu xmm0, [eax] // fetch 16 pixels of argb - movdqu xmm1, [eax + 16] - movdqu xmm2, [eax + 32] - movdqu xmm3, [eax + 48] - lea eax, [eax + 64] - pshufb xmm0, xmm6 // pack 16 bytes of ARGB to 12 bytes of RGB - pshufb xmm1, xmm6 - pshufb xmm2, xmm6 - pshufb xmm3, xmm6 - movdqa xmm4, xmm1 // 4 bytes from 1 for 0 - psrldq xmm1, 4 // 8 bytes from 1 - pslldq xmm4, 12 // 4 bytes from 1 for 0 - movdqa xmm5, xmm2 // 8 bytes from 2 for 1 - por xmm0, xmm4 // 4 bytes from 1 for 0 - pslldq xmm5, 8 // 8 bytes from 2 for 1 - movdqu [edx], xmm0 // store 0 - por xmm1, xmm5 // 8 bytes from 2 for 1 - psrldq xmm2, 8 // 4 bytes from 2 - pslldq xmm3, 4 // 12 bytes from 3 for 2 - por xmm2, xmm3 // 12 bytes from 3 for 2 - movdqu [edx + 16], xmm1 // store 1 - movdqu [edx + 32], xmm2 // store 2 - lea edx, [edx + 48] - sub ecx, 16 - jg convertloop - ret - } -} - -__declspec(naked) void ARGBToRGB565Row_SSE2(const uint8_t* src_argb, - uint8_t* dst_rgb, - int width) { - __asm { - mov eax, [esp + 4] // src_argb - mov edx, [esp + 8] // dst_rgb - mov ecx, [esp + 12] // width - pcmpeqb xmm3, xmm3 // generate mask 0x0000001f - psrld xmm3, 27 - pcmpeqb xmm4, xmm4 // generate mask 0x000007e0 - psrld xmm4, 26 - pslld xmm4, 
5 - pcmpeqb xmm5, xmm5 // generate mask 0xfffff800 - pslld xmm5, 11 - - convertloop: - movdqu xmm0, [eax] // fetch 4 pixels of argb - movdqa xmm1, xmm0 // B - movdqa xmm2, xmm0 // G - pslld xmm0, 8 // R - psrld xmm1, 3 // B - psrld xmm2, 5 // G - psrad xmm0, 16 // R - pand xmm1, xmm3 // B - pand xmm2, xmm4 // G - pand xmm0, xmm5 // R - por xmm1, xmm2 // BG - por xmm0, xmm1 // BGR - packssdw xmm0, xmm0 - lea eax, [eax + 16] - movq qword ptr [edx], xmm0 // store 4 pixels of RGB565 - lea edx, [edx + 8] - sub ecx, 4 - jg convertloop - ret - } -} - -__declspec(naked) void ARGBToRGB565DitherRow_SSE2(const uint8_t* src_argb, - uint8_t* dst_rgb, - const uint32_t dither4, - int width) { - __asm { - - mov eax, [esp + 4] // src_argb - mov edx, [esp + 8] // dst_rgb - movd xmm6, [esp + 12] // dither4 - mov ecx, [esp + 16] // width - punpcklbw xmm6, xmm6 // make dither 16 bytes - movdqa xmm7, xmm6 - punpcklwd xmm6, xmm6 - punpckhwd xmm7, xmm7 - pcmpeqb xmm3, xmm3 // generate mask 0x0000001f - psrld xmm3, 27 - pcmpeqb xmm4, xmm4 // generate mask 0x000007e0 - psrld xmm4, 26 - pslld xmm4, 5 - pcmpeqb xmm5, xmm5 // generate mask 0xfffff800 - pslld xmm5, 11 - - convertloop: - movdqu xmm0, [eax] // fetch 4 pixels of argb - paddusb xmm0, xmm6 // add dither - movdqa xmm1, xmm0 // B - movdqa xmm2, xmm0 // G - pslld xmm0, 8 // R - psrld xmm1, 3 // B - psrld xmm2, 5 // G - psrad xmm0, 16 // R - pand xmm1, xmm3 // B - pand xmm2, xmm4 // G - pand xmm0, xmm5 // R - por xmm1, xmm2 // BG - por xmm0, xmm1 // BGR - packssdw xmm0, xmm0 - lea eax, [eax + 16] - movq qword ptr [edx], xmm0 // store 4 pixels of RGB565 - lea edx, [edx + 8] - sub ecx, 4 - jg convertloop - ret - } -} - -#ifdef HAS_ARGBTORGB565DITHERROW_AVX2 -__declspec(naked) void ARGBToRGB565DitherRow_AVX2(const uint8_t* src_argb, - uint8_t* dst_rgb, - const uint32_t dither4, - int width) { - __asm { - mov eax, [esp + 4] // src_argb - mov edx, [esp + 8] // dst_rgb - vbroadcastss xmm6, [esp + 12] // dither4 - mov ecx, [esp + 16] // width - vpunpcklbw xmm6, xmm6, xmm6 // make dither 32 bytes - vpermq ymm6, ymm6, 0xd8 - vpunpcklwd ymm6, ymm6, ymm6 - vpcmpeqb ymm3, ymm3, ymm3 // generate mask 0x0000001f - vpsrld ymm3, ymm3, 27 - vpcmpeqb ymm4, ymm4, ymm4 // generate mask 0x000007e0 - vpsrld ymm4, ymm4, 26 - vpslld ymm4, ymm4, 5 - vpslld ymm5, ymm3, 11 // generate mask 0x0000f800 - - convertloop: - vmovdqu ymm0, [eax] // fetch 8 pixels of argb - vpaddusb ymm0, ymm0, ymm6 // add dither - vpsrld ymm2, ymm0, 5 // G - vpsrld ymm1, ymm0, 3 // B - vpsrld ymm0, ymm0, 8 // R - vpand ymm2, ymm2, ymm4 // G - vpand ymm1, ymm1, ymm3 // B - vpand ymm0, ymm0, ymm5 // R - vpor ymm1, ymm1, ymm2 // BG - vpor ymm0, ymm0, ymm1 // BGR - vpackusdw ymm0, ymm0, ymm0 - vpermq ymm0, ymm0, 0xd8 - lea eax, [eax + 32] - vmovdqu [edx], xmm0 // store 8 pixels of RGB565 - lea edx, [edx + 16] - sub ecx, 8 - jg convertloop - vzeroupper - ret - } -} -#endif // HAS_ARGBTORGB565DITHERROW_AVX2 - -// TODO(fbarchard): Improve sign extension/packing. 
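To make the dithered store concrete: ARGBToRGB565DitherRow broadcasts the four dither4 bytes so that pixel x within each group of four gets dither byte (x & 3) added, with unsigned saturation, to its B, G and R before the usual 5-6-5 truncation. A scalar sketch of that behavior, assuming the same repeat-every-4-pixels pattern (the helper names are made up for illustration):

#include <stdint.h>

static uint8_t AddSat8(uint8_t a, uint8_t b) {
  unsigned s = (unsigned)a + b;
  return (uint8_t)(s > 255 ? 255 : s);  /* mirrors paddusb saturation */
}

/* Hypothetical scalar mirror of ARGBToRGB565DitherRow. */
static void ArgbToRgb565DitherRow_sketch(const uint8_t* src_argb,
                                         uint8_t* dst_rgb565,
                                         uint32_t dither4, int width) {
  for (int x = 0; x < width; ++x) {
    uint8_t d = (uint8_t)(dither4 >> ((x & 3) * 8));  /* one dither byte per pixel */
    uint8_t b = AddSat8(src_argb[0], d);
    uint8_t g = AddSat8(src_argb[1], d);
    uint8_t r = AddSat8(src_argb[2], d);
    uint16_t pix = (uint16_t)(((r >> 3) << 11) | ((g >> 2) << 5) | (b >> 3));
    dst_rgb565[0] = (uint8_t)pix;
    dst_rgb565[1] = (uint8_t)(pix >> 8);
    src_argb += 4;
    dst_rgb565 += 2;
  }
}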
-__declspec(naked) void ARGBToARGB1555Row_SSE2(const uint8_t* src_argb, - uint8_t* dst_rgb, - int width) { - __asm { - mov eax, [esp + 4] // src_argb - mov edx, [esp + 8] // dst_rgb - mov ecx, [esp + 12] // width - pcmpeqb xmm4, xmm4 // generate mask 0x0000001f - psrld xmm4, 27 - movdqa xmm5, xmm4 // generate mask 0x000003e0 - pslld xmm5, 5 - movdqa xmm6, xmm4 // generate mask 0x00007c00 - pslld xmm6, 10 - pcmpeqb xmm7, xmm7 // generate mask 0xffff8000 - pslld xmm7, 15 - - convertloop: - movdqu xmm0, [eax] // fetch 4 pixels of argb - movdqa xmm1, xmm0 // B - movdqa xmm2, xmm0 // G - movdqa xmm3, xmm0 // R - psrad xmm0, 16 // A - psrld xmm1, 3 // B - psrld xmm2, 6 // G - psrld xmm3, 9 // R - pand xmm0, xmm7 // A - pand xmm1, xmm4 // B - pand xmm2, xmm5 // G - pand xmm3, xmm6 // R - por xmm0, xmm1 // BA - por xmm2, xmm3 // GR - por xmm0, xmm2 // BGRA - packssdw xmm0, xmm0 - lea eax, [eax + 16] - movq qword ptr [edx], xmm0 // store 4 pixels of ARGB1555 - lea edx, [edx + 8] - sub ecx, 4 - jg convertloop - ret - } -} - -__declspec(naked) void ARGBToARGB4444Row_SSE2(const uint8_t* src_argb, - uint8_t* dst_rgb, - int width) { - __asm { - mov eax, [esp + 4] // src_argb - mov edx, [esp + 8] // dst_rgb - mov ecx, [esp + 12] // width - pcmpeqb xmm4, xmm4 // generate mask 0xf000f000 - psllw xmm4, 12 - movdqa xmm3, xmm4 // generate mask 0x00f000f0 - psrlw xmm3, 8 - - convertloop: - movdqu xmm0, [eax] // fetch 4 pixels of argb - movdqa xmm1, xmm0 - pand xmm0, xmm3 // low nibble - pand xmm1, xmm4 // high nibble - psrld xmm0, 4 - psrld xmm1, 8 - por xmm0, xmm1 - packuswb xmm0, xmm0 - lea eax, [eax + 16] - movq qword ptr [edx], xmm0 // store 4 pixels of ARGB4444 - lea edx, [edx + 8] - sub ecx, 4 - jg convertloop - ret - } -} - -#ifdef HAS_ARGBTORGB565ROW_AVX2 -__declspec(naked) void ARGBToRGB565Row_AVX2(const uint8_t* src_argb, - uint8_t* dst_rgb, - int width) { - __asm { - mov eax, [esp + 4] // src_argb - mov edx, [esp + 8] // dst_rgb - mov ecx, [esp + 12] // width - vpcmpeqb ymm3, ymm3, ymm3 // generate mask 0x0000001f - vpsrld ymm3, ymm3, 27 - vpcmpeqb ymm4, ymm4, ymm4 // generate mask 0x000007e0 - vpsrld ymm4, ymm4, 26 - vpslld ymm4, ymm4, 5 - vpslld ymm5, ymm3, 11 // generate mask 0x0000f800 - - convertloop: - vmovdqu ymm0, [eax] // fetch 8 pixels of argb - vpsrld ymm2, ymm0, 5 // G - vpsrld ymm1, ymm0, 3 // B - vpsrld ymm0, ymm0, 8 // R - vpand ymm2, ymm2, ymm4 // G - vpand ymm1, ymm1, ymm3 // B - vpand ymm0, ymm0, ymm5 // R - vpor ymm1, ymm1, ymm2 // BG - vpor ymm0, ymm0, ymm1 // BGR - vpackusdw ymm0, ymm0, ymm0 - vpermq ymm0, ymm0, 0xd8 - lea eax, [eax + 32] - vmovdqu [edx], xmm0 // store 8 pixels of RGB565 - lea edx, [edx + 16] - sub ecx, 8 - jg convertloop - vzeroupper - ret - } -} -#endif // HAS_ARGBTORGB565ROW_AVX2 - -#ifdef HAS_ARGBTOARGB1555ROW_AVX2 -__declspec(naked) void ARGBToARGB1555Row_AVX2(const uint8_t* src_argb, - uint8_t* dst_rgb, - int width) { - __asm { - mov eax, [esp + 4] // src_argb - mov edx, [esp + 8] // dst_rgb - mov ecx, [esp + 12] // width - vpcmpeqb ymm4, ymm4, ymm4 - vpsrld ymm4, ymm4, 27 // generate mask 0x0000001f - vpslld ymm5, ymm4, 5 // generate mask 0x000003e0 - vpslld ymm6, ymm4, 10 // generate mask 0x00007c00 - vpcmpeqb ymm7, ymm7, ymm7 // generate mask 0xffff8000 - vpslld ymm7, ymm7, 15 - - convertloop: - vmovdqu ymm0, [eax] // fetch 8 pixels of argb - vpsrld ymm3, ymm0, 9 // R - vpsrld ymm2, ymm0, 6 // G - vpsrld ymm1, ymm0, 3 // B - vpsrad ymm0, ymm0, 16 // A - vpand ymm3, ymm3, ymm6 // R - vpand ymm2, ymm2, ymm5 // G - vpand ymm1, ymm1, ymm4 // B - vpand ymm0, 
ymm0, ymm7 // A - vpor ymm0, ymm0, ymm1 // BA - vpor ymm2, ymm2, ymm3 // GR - vpor ymm0, ymm0, ymm2 // BGRA - vpackssdw ymm0, ymm0, ymm0 - vpermq ymm0, ymm0, 0xd8 - lea eax, [eax + 32] - vmovdqu [edx], xmm0 // store 8 pixels of ARGB1555 - lea edx, [edx + 16] - sub ecx, 8 - jg convertloop - vzeroupper - ret - } -} -#endif // HAS_ARGBTOARGB1555ROW_AVX2 - -#ifdef HAS_ARGBTOARGB4444ROW_AVX2 -__declspec(naked) void ARGBToARGB4444Row_AVX2(const uint8_t* src_argb, - uint8_t* dst_rgb, - int width) { - __asm { - mov eax, [esp + 4] // src_argb - mov edx, [esp + 8] // dst_rgb - mov ecx, [esp + 12] // width - vpcmpeqb ymm4, ymm4, ymm4 // generate mask 0xf000f000 - vpsllw ymm4, ymm4, 12 - vpsrlw ymm3, ymm4, 8 // generate mask 0x00f000f0 - - convertloop: - vmovdqu ymm0, [eax] // fetch 8 pixels of argb - vpand ymm1, ymm0, ymm4 // high nibble - vpand ymm0, ymm0, ymm3 // low nibble - vpsrld ymm1, ymm1, 8 - vpsrld ymm0, ymm0, 4 - vpor ymm0, ymm0, ymm1 - vpackuswb ymm0, ymm0, ymm0 - vpermq ymm0, ymm0, 0xd8 - lea eax, [eax + 32] - vmovdqu [edx], xmm0 // store 8 pixels of ARGB4444 - lea edx, [edx + 16] - sub ecx, 8 - jg convertloop - vzeroupper - ret - } -} -#endif // HAS_ARGBTOARGB4444ROW_AVX2 - -// Convert 16 ARGB pixels (64 bytes) to 16 Y values. -__declspec(naked) void ARGBToYRow_SSSE3(const uint8_t* src_argb, - uint8_t* dst_y, - int width) { - __asm { - mov eax, [esp + 4] /* src_argb */ - mov edx, [esp + 8] /* dst_y */ - mov ecx, [esp + 12] /* width */ - movdqa xmm4, xmmword ptr kARGBToY - movdqa xmm5, xmmword ptr kAddY16 - - convertloop: - movdqu xmm0, [eax] - movdqu xmm1, [eax + 16] - movdqu xmm2, [eax + 32] - movdqu xmm3, [eax + 48] - pmaddubsw xmm0, xmm4 - pmaddubsw xmm1, xmm4 - pmaddubsw xmm2, xmm4 - pmaddubsw xmm3, xmm4 - lea eax, [eax + 64] - phaddw xmm0, xmm1 - phaddw xmm2, xmm3 - psrlw xmm0, 7 - psrlw xmm2, 7 - packuswb xmm0, xmm2 - paddb xmm0, xmm5 - movdqu [edx], xmm0 - lea edx, [edx + 16] - sub ecx, 16 - jg convertloop - ret - } -} - -// Convert 16 ARGB pixels (64 bytes) to 16 YJ values. -// Same as ARGBToYRow but different coefficients, no add 16, but do rounding. -__declspec(naked) void ARGBToYJRow_SSSE3(const uint8_t* src_argb, - uint8_t* dst_y, - int width) { - __asm { - mov eax, [esp + 4] /* src_argb */ - mov edx, [esp + 8] /* dst_y */ - mov ecx, [esp + 12] /* width */ - movdqa xmm4, xmmword ptr kARGBToYJ - movdqa xmm5, xmmword ptr kAddYJ64 - - convertloop: - movdqu xmm0, [eax] - movdqu xmm1, [eax + 16] - movdqu xmm2, [eax + 32] - movdqu xmm3, [eax + 48] - pmaddubsw xmm0, xmm4 - pmaddubsw xmm1, xmm4 - pmaddubsw xmm2, xmm4 - pmaddubsw xmm3, xmm4 - lea eax, [eax + 64] - phaddw xmm0, xmm1 - phaddw xmm2, xmm3 - paddw xmm0, xmm5 // Add .5 for rounding. - paddw xmm2, xmm5 - psrlw xmm0, 7 - psrlw xmm2, 7 - packuswb xmm0, xmm2 - movdqu [edx], xmm0 - lea edx, [edx + 16] - sub ecx, 16 - jg convertloop - ret - } -} - -#ifdef HAS_ARGBTOYROW_AVX2 -// vpermd for vphaddw + vpackuswb vpermd. -static const lvec32 kPermdARGBToY_AVX = {0, 4, 1, 5, 2, 6, 3, 7}; - -// Convert 32 ARGB pixels (128 bytes) to 32 Y values. 
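The ARGBToY kernels above compute a studio-swing luma as a 7-bit fixed-point dot product over B, G, R followed by a +16 bias (the YJ variants use different coefficients, drop the bias, and round instead). A scalar sketch of the same structure; the coefficient values shown are the usual BT.601 ones and are an assumption here, since kARGBToY itself is defined elsewhere in the file:

#include <stdint.h>

/* Hypothetical scalar mirror of ARGBToYRow; coefficients are assumed. */
static void ArgbToYRow_sketch(const uint8_t* src_argb, uint8_t* dst_y,
                              int width) {
  const int cb = 13, cg = 65, cr = 33;  /* ~0.098 B, 0.504 G, 0.257 R in Q7 */
  for (int x = 0; x < width; ++x) {
    int y = (cb * src_argb[0] + cg * src_argb[1] + cr * src_argb[2]) >> 7;
    dst_y[x] = (uint8_t)(y + 16);  /* +16 mirrors the kAddY16 bias */
    src_argb += 4;
  }
}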
-__declspec(naked) void ARGBToYRow_AVX2(const uint8_t* src_argb, - uint8_t* dst_y, - int width) { - __asm { - mov eax, [esp + 4] /* src_argb */ - mov edx, [esp + 8] /* dst_y */ - mov ecx, [esp + 12] /* width */ - vbroadcastf128 ymm4, xmmword ptr kARGBToY - vbroadcastf128 ymm5, xmmword ptr kAddY16 - vmovdqu ymm6, ymmword ptr kPermdARGBToY_AVX - - convertloop: - vmovdqu ymm0, [eax] - vmovdqu ymm1, [eax + 32] - vmovdqu ymm2, [eax + 64] - vmovdqu ymm3, [eax + 96] - vpmaddubsw ymm0, ymm0, ymm4 - vpmaddubsw ymm1, ymm1, ymm4 - vpmaddubsw ymm2, ymm2, ymm4 - vpmaddubsw ymm3, ymm3, ymm4 - lea eax, [eax + 128] - vphaddw ymm0, ymm0, ymm1 // mutates. - vphaddw ymm2, ymm2, ymm3 - vpsrlw ymm0, ymm0, 7 - vpsrlw ymm2, ymm2, 7 - vpackuswb ymm0, ymm0, ymm2 // mutates. - vpermd ymm0, ymm6, ymm0 // For vphaddw + vpackuswb mutation. - vpaddb ymm0, ymm0, ymm5 // add 16 for Y - vmovdqu [edx], ymm0 - lea edx, [edx + 32] - sub ecx, 32 - jg convertloop - vzeroupper - ret - } -} -#endif // HAS_ARGBTOYROW_AVX2 - -#ifdef HAS_ARGBTOYJROW_AVX2 -// Convert 32 ARGB pixels (128 bytes) to 32 Y values. -__declspec(naked) void ARGBToYJRow_AVX2(const uint8_t* src_argb, - uint8_t* dst_y, - int width) { - __asm { - mov eax, [esp + 4] /* src_argb */ - mov edx, [esp + 8] /* dst_y */ - mov ecx, [esp + 12] /* width */ - vbroadcastf128 ymm4, xmmword ptr kARGBToYJ - vbroadcastf128 ymm5, xmmword ptr kAddYJ64 - vmovdqu ymm6, ymmword ptr kPermdARGBToY_AVX - - convertloop: - vmovdqu ymm0, [eax] - vmovdqu ymm1, [eax + 32] - vmovdqu ymm2, [eax + 64] - vmovdqu ymm3, [eax + 96] - vpmaddubsw ymm0, ymm0, ymm4 - vpmaddubsw ymm1, ymm1, ymm4 - vpmaddubsw ymm2, ymm2, ymm4 - vpmaddubsw ymm3, ymm3, ymm4 - lea eax, [eax + 128] - vphaddw ymm0, ymm0, ymm1 // mutates. - vphaddw ymm2, ymm2, ymm3 - vpaddw ymm0, ymm0, ymm5 // Add .5 for rounding. - vpaddw ymm2, ymm2, ymm5 - vpsrlw ymm0, ymm0, 7 - vpsrlw ymm2, ymm2, 7 - vpackuswb ymm0, ymm0, ymm2 // mutates. - vpermd ymm0, ymm6, ymm0 // For vphaddw + vpackuswb mutation. 
- vmovdqu [edx], ymm0 - lea edx, [edx + 32] - sub ecx, 32 - jg convertloop - - vzeroupper - ret - } -} -#endif // HAS_ARGBTOYJROW_AVX2 - -__declspec(naked) void BGRAToYRow_SSSE3(const uint8_t* src_argb, - uint8_t* dst_y, - int width) { - __asm { - mov eax, [esp + 4] /* src_argb */ - mov edx, [esp + 8] /* dst_y */ - mov ecx, [esp + 12] /* width */ - movdqa xmm4, xmmword ptr kBGRAToY - movdqa xmm5, xmmword ptr kAddY16 - - convertloop: - movdqu xmm0, [eax] - movdqu xmm1, [eax + 16] - movdqu xmm2, [eax + 32] - movdqu xmm3, [eax + 48] - pmaddubsw xmm0, xmm4 - pmaddubsw xmm1, xmm4 - pmaddubsw xmm2, xmm4 - pmaddubsw xmm3, xmm4 - lea eax, [eax + 64] - phaddw xmm0, xmm1 - phaddw xmm2, xmm3 - psrlw xmm0, 7 - psrlw xmm2, 7 - packuswb xmm0, xmm2 - paddb xmm0, xmm5 - movdqu [edx], xmm0 - lea edx, [edx + 16] - sub ecx, 16 - jg convertloop - ret - } -} - -__declspec(naked) void ABGRToYRow_SSSE3(const uint8_t* src_argb, - uint8_t* dst_y, - int width) { - __asm { - mov eax, [esp + 4] /* src_argb */ - mov edx, [esp + 8] /* dst_y */ - mov ecx, [esp + 12] /* width */ - movdqa xmm4, xmmword ptr kABGRToY - movdqa xmm5, xmmword ptr kAddY16 - - convertloop: - movdqu xmm0, [eax] - movdqu xmm1, [eax + 16] - movdqu xmm2, [eax + 32] - movdqu xmm3, [eax + 48] - pmaddubsw xmm0, xmm4 - pmaddubsw xmm1, xmm4 - pmaddubsw xmm2, xmm4 - pmaddubsw xmm3, xmm4 - lea eax, [eax + 64] - phaddw xmm0, xmm1 - phaddw xmm2, xmm3 - psrlw xmm0, 7 - psrlw xmm2, 7 - packuswb xmm0, xmm2 - paddb xmm0, xmm5 - movdqu [edx], xmm0 - lea edx, [edx + 16] - sub ecx, 16 - jg convertloop - ret - } -} - -__declspec(naked) void RGBAToYRow_SSSE3(const uint8_t* src_argb, - uint8_t* dst_y, - int width) { - __asm { - mov eax, [esp + 4] /* src_argb */ - mov edx, [esp + 8] /* dst_y */ - mov ecx, [esp + 12] /* width */ - movdqa xmm4, xmmword ptr kRGBAToY - movdqa xmm5, xmmword ptr kAddY16 - - convertloop: - movdqu xmm0, [eax] - movdqu xmm1, [eax + 16] - movdqu xmm2, [eax + 32] - movdqu xmm3, [eax + 48] - pmaddubsw xmm0, xmm4 - pmaddubsw xmm1, xmm4 - pmaddubsw xmm2, xmm4 - pmaddubsw xmm3, xmm4 - lea eax, [eax + 64] - phaddw xmm0, xmm1 - phaddw xmm2, xmm3 - psrlw xmm0, 7 - psrlw xmm2, 7 - packuswb xmm0, xmm2 - paddb xmm0, xmm5 - movdqu [edx], xmm0 - lea edx, [edx + 16] - sub ecx, 16 - jg convertloop - ret - } -} - -__declspec(naked) void ARGBToUVRow_SSSE3(const uint8_t* src_argb, - int src_stride_argb, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - __asm { - push esi - push edi - mov eax, [esp + 8 + 4] // src_argb - mov esi, [esp + 8 + 8] // src_stride_argb - mov edx, [esp + 8 + 12] // dst_u - mov edi, [esp + 8 + 16] // dst_v - mov ecx, [esp + 8 + 20] // width - movdqa xmm5, xmmword ptr kBiasUV128 - movdqa xmm6, xmmword ptr kARGBToV - movdqa xmm7, xmmword ptr kARGBToU - sub edi, edx // stride from u to v - - convertloop: - /* step 1 - subsample 16x2 argb pixels to 8x1 */ - movdqu xmm0, [eax] - movdqu xmm4, [eax + esi] - pavgb xmm0, xmm4 - movdqu xmm1, [eax + 16] - movdqu xmm4, [eax + esi + 16] - pavgb xmm1, xmm4 - movdqu xmm2, [eax + 32] - movdqu xmm4, [eax + esi + 32] - pavgb xmm2, xmm4 - movdqu xmm3, [eax + 48] - movdqu xmm4, [eax + esi + 48] - pavgb xmm3, xmm4 - - lea eax, [eax + 64] - movdqa xmm4, xmm0 - shufps xmm0, xmm1, 0x88 - shufps xmm4, xmm1, 0xdd - pavgb xmm0, xmm4 - movdqa xmm4, xmm2 - shufps xmm2, xmm3, 0x88 - shufps xmm4, xmm3, 0xdd - pavgb xmm2, xmm4 - - // step 2 - convert to U and V - // from here down is very similar to Y code except - // instead of 16 different pixels, its 8 pixels of U and 8 of V - movdqa xmm1, xmm0 - movdqa xmm3, xmm2 
- pmaddubsw xmm0, xmm7 // U - pmaddubsw xmm2, xmm7 - pmaddubsw xmm1, xmm6 // V - pmaddubsw xmm3, xmm6 - phaddw xmm0, xmm2 - phaddw xmm1, xmm3 - psraw xmm0, 8 - psraw xmm1, 8 - packsswb xmm0, xmm1 - paddb xmm0, xmm5 // -> unsigned - - // step 3 - store 8 U and 8 V values - movlps qword ptr [edx], xmm0 // U - movhps qword ptr [edx + edi], xmm0 // V - lea edx, [edx + 8] - sub ecx, 16 - jg convertloop - - pop edi - pop esi - ret - } -} - -__declspec(naked) void ARGBToUVJRow_SSSE3(const uint8_t* src_argb, - int src_stride_argb, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - __asm { - push esi - push edi - mov eax, [esp + 8 + 4] // src_argb - mov esi, [esp + 8 + 8] // src_stride_argb - mov edx, [esp + 8 + 12] // dst_u - mov edi, [esp + 8 + 16] // dst_v - mov ecx, [esp + 8 + 20] // width - movdqa xmm5, xmmword ptr kBiasUV128 - movdqa xmm6, xmmword ptr kARGBToVJ - movdqa xmm7, xmmword ptr kARGBToUJ - sub edi, edx // stride from u to v - - convertloop: - /* step 1 - subsample 16x2 argb pixels to 8x1 */ - movdqu xmm0, [eax] - movdqu xmm4, [eax + esi] - pavgb xmm0, xmm4 - movdqu xmm1, [eax + 16] - movdqu xmm4, [eax + esi + 16] - pavgb xmm1, xmm4 - movdqu xmm2, [eax + 32] - movdqu xmm4, [eax + esi + 32] - pavgb xmm2, xmm4 - movdqu xmm3, [eax + 48] - movdqu xmm4, [eax + esi + 48] - pavgb xmm3, xmm4 - - lea eax, [eax + 64] - movdqa xmm4, xmm0 - shufps xmm0, xmm1, 0x88 - shufps xmm4, xmm1, 0xdd - pavgb xmm0, xmm4 - movdqa xmm4, xmm2 - shufps xmm2, xmm3, 0x88 - shufps xmm4, xmm3, 0xdd - pavgb xmm2, xmm4 - - // step 2 - convert to U and V - // from here down is very similar to Y code except - // instead of 16 different pixels, its 8 pixels of U and 8 of V - movdqa xmm1, xmm0 - movdqa xmm3, xmm2 - pmaddubsw xmm0, xmm7 // U - pmaddubsw xmm2, xmm7 - pmaddubsw xmm1, xmm6 // V - pmaddubsw xmm3, xmm6 - phaddw xmm0, xmm2 - phaddw xmm1, xmm3 - paddw xmm0, xmm5 // +.5 rounding -> unsigned - paddw xmm1, xmm5 - psraw xmm0, 8 - psraw xmm1, 8 - packsswb xmm0, xmm1 - - // step 3 - store 8 U and 8 V values - movlps qword ptr [edx], xmm0 // U - movhps qword ptr [edx + edi], xmm0 // V - lea edx, [edx + 8] - sub ecx, 16 - jg convertloop - - pop edi - pop esi - ret - } -} - -#ifdef HAS_ARGBTOUVROW_AVX2 -__declspec(naked) void ARGBToUVRow_AVX2(const uint8_t* src_argb, - int src_stride_argb, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - __asm { - push esi - push edi - mov eax, [esp + 8 + 4] // src_argb - mov esi, [esp + 8 + 8] // src_stride_argb - mov edx, [esp + 8 + 12] // dst_u - mov edi, [esp + 8 + 16] // dst_v - mov ecx, [esp + 8 + 20] // width - vbroadcastf128 ymm5, xmmword ptr kBiasUV128 - vbroadcastf128 ymm6, xmmword ptr kARGBToV - vbroadcastf128 ymm7, xmmword ptr kARGBToU - sub edi, edx // stride from u to v - - convertloop: - /* step 1 - subsample 32x2 argb pixels to 16x1 */ - vmovdqu ymm0, [eax] - vmovdqu ymm1, [eax + 32] - vmovdqu ymm2, [eax + 64] - vmovdqu ymm3, [eax + 96] - vpavgb ymm0, ymm0, [eax + esi] - vpavgb ymm1, ymm1, [eax + esi + 32] - vpavgb ymm2, ymm2, [eax + esi + 64] - vpavgb ymm3, ymm3, [eax + esi + 96] - lea eax, [eax + 128] - vshufps ymm4, ymm0, ymm1, 0x88 - vshufps ymm0, ymm0, ymm1, 0xdd - vpavgb ymm0, ymm0, ymm4 // mutated by vshufps - vshufps ymm4, ymm2, ymm3, 0x88 - vshufps ymm2, ymm2, ymm3, 0xdd - vpavgb ymm2, ymm2, ymm4 // mutated by vshufps - - // step 2 - convert to U and V - // from here down is very similar to Y code except - // instead of 32 different pixels, its 16 pixels of U and 16 of V - vpmaddubsw ymm1, ymm0, ymm7 // U - vpmaddubsw ymm3, ymm2, ymm7 - vpmaddubsw ymm0, 
ymm0, ymm6 // V - vpmaddubsw ymm2, ymm2, ymm6 - vphaddw ymm1, ymm1, ymm3 // mutates - vphaddw ymm0, ymm0, ymm2 - vpsraw ymm1, ymm1, 8 - vpsraw ymm0, ymm0, 8 - vpacksswb ymm0, ymm1, ymm0 // mutates - vpermq ymm0, ymm0, 0xd8 // For vpacksswb - vpshufb ymm0, ymm0, ymmword ptr kShufARGBToUV_AVX // for vshufps/vphaddw - vpaddb ymm0, ymm0, ymm5 // -> unsigned - - // step 3 - store 16 U and 16 V values - vextractf128 [edx], ymm0, 0 // U - vextractf128 [edx + edi], ymm0, 1 // V - lea edx, [edx + 16] - sub ecx, 32 - jg convertloop - - pop edi - pop esi - vzeroupper - ret - } -} -#endif // HAS_ARGBTOUVROW_AVX2 - -#ifdef HAS_ARGBTOUVJROW_AVX2 -__declspec(naked) void ARGBToUVJRow_AVX2(const uint8_t* src_argb, - int src_stride_argb, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - __asm { - push esi - push edi - mov eax, [esp + 8 + 4] // src_argb - mov esi, [esp + 8 + 8] // src_stride_argb - mov edx, [esp + 8 + 12] // dst_u - mov edi, [esp + 8 + 16] // dst_v - mov ecx, [esp + 8 + 20] // width - vbroadcastf128 ymm5, xmmword ptr kBiasUV128 - vbroadcastf128 ymm6, xmmword ptr kARGBToVJ - vbroadcastf128 ymm7, xmmword ptr kARGBToUJ - sub edi, edx // stride from u to v - - convertloop: - /* step 1 - subsample 32x2 argb pixels to 16x1 */ - vmovdqu ymm0, [eax] - vmovdqu ymm1, [eax + 32] - vmovdqu ymm2, [eax + 64] - vmovdqu ymm3, [eax + 96] - vpavgb ymm0, ymm0, [eax + esi] - vpavgb ymm1, ymm1, [eax + esi + 32] - vpavgb ymm2, ymm2, [eax + esi + 64] - vpavgb ymm3, ymm3, [eax + esi + 96] - lea eax, [eax + 128] - vshufps ymm4, ymm0, ymm1, 0x88 - vshufps ymm0, ymm0, ymm1, 0xdd - vpavgb ymm0, ymm0, ymm4 // mutated by vshufps - vshufps ymm4, ymm2, ymm3, 0x88 - vshufps ymm2, ymm2, ymm3, 0xdd - vpavgb ymm2, ymm2, ymm4 // mutated by vshufps - - // step 2 - convert to U and V - // from here down is very similar to Y code except - // instead of 32 different pixels, its 16 pixels of U and 16 of V - vpmaddubsw ymm1, ymm0, ymm7 // U - vpmaddubsw ymm3, ymm2, ymm7 - vpmaddubsw ymm0, ymm0, ymm6 // V - vpmaddubsw ymm2, ymm2, ymm6 - vphaddw ymm1, ymm1, ymm3 // mutates - vphaddw ymm0, ymm0, ymm2 - vpaddw ymm1, ymm1, ymm5 // +.5 rounding -> unsigned - vpaddw ymm0, ymm0, ymm5 - vpsraw ymm1, ymm1, 8 - vpsraw ymm0, ymm0, 8 - vpacksswb ymm0, ymm1, ymm0 // mutates - vpermq ymm0, ymm0, 0xd8 // For vpacksswb - vpshufb ymm0, ymm0, ymmword ptr kShufARGBToUV_AVX // for vshufps/vphaddw - - // step 3 - store 16 U and 16 V values - vextractf128 [edx], ymm0, 0 // U - vextractf128 [edx + edi], ymm0, 1 // V - lea edx, [edx + 16] - sub ecx, 32 - jg convertloop - - pop edi - pop esi - vzeroupper - ret - } -} -#endif // HAS_ARGBTOUVJROW_AVX2 - -__declspec(naked) void ARGBToUV444Row_SSSE3(const uint8_t* src_argb, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - __asm { - push edi - mov eax, [esp + 4 + 4] // src_argb - mov edx, [esp + 4 + 8] // dst_u - mov edi, [esp + 4 + 12] // dst_v - mov ecx, [esp + 4 + 16] // width - movdqa xmm5, xmmword ptr kBiasUV128 - movdqa xmm6, xmmword ptr kARGBToV - movdqa xmm7, xmmword ptr kARGBToU - sub edi, edx // stride from u to v - - convertloop: - /* convert to U and V */ - movdqu xmm0, [eax] // U - movdqu xmm1, [eax + 16] - movdqu xmm2, [eax + 32] - movdqu xmm3, [eax + 48] - pmaddubsw xmm0, xmm7 - pmaddubsw xmm1, xmm7 - pmaddubsw xmm2, xmm7 - pmaddubsw xmm3, xmm7 - phaddw xmm0, xmm1 - phaddw xmm2, xmm3 - psraw xmm0, 8 - psraw xmm2, 8 - packsswb xmm0, xmm2 - paddb xmm0, xmm5 - movdqu [edx], xmm0 - - movdqu xmm0, [eax] // V - movdqu xmm1, [eax + 16] - movdqu xmm2, [eax + 32] - movdqu xmm3, [eax + 48] - pmaddubsw 
xmm0, xmm6 - pmaddubsw xmm1, xmm6 - pmaddubsw xmm2, xmm6 - pmaddubsw xmm3, xmm6 - phaddw xmm0, xmm1 - phaddw xmm2, xmm3 - psraw xmm0, 8 - psraw xmm2, 8 - packsswb xmm0, xmm2 - paddb xmm0, xmm5 - lea eax, [eax + 64] - movdqu [edx + edi], xmm0 - lea edx, [edx + 16] - sub ecx, 16 - jg convertloop - - pop edi - ret - } -} - -__declspec(naked) void BGRAToUVRow_SSSE3(const uint8_t* src_argb, - int src_stride_argb, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - __asm { - push esi - push edi - mov eax, [esp + 8 + 4] // src_argb - mov esi, [esp + 8 + 8] // src_stride_argb - mov edx, [esp + 8 + 12] // dst_u - mov edi, [esp + 8 + 16] // dst_v - mov ecx, [esp + 8 + 20] // width - movdqa xmm5, xmmword ptr kBiasUV128 - movdqa xmm6, xmmword ptr kBGRAToV - movdqa xmm7, xmmword ptr kBGRAToU - sub edi, edx // stride from u to v - - convertloop: - /* step 1 - subsample 16x2 argb pixels to 8x1 */ - movdqu xmm0, [eax] - movdqu xmm4, [eax + esi] - pavgb xmm0, xmm4 - movdqu xmm1, [eax + 16] - movdqu xmm4, [eax + esi + 16] - pavgb xmm1, xmm4 - movdqu xmm2, [eax + 32] - movdqu xmm4, [eax + esi + 32] - pavgb xmm2, xmm4 - movdqu xmm3, [eax + 48] - movdqu xmm4, [eax + esi + 48] - pavgb xmm3, xmm4 - - lea eax, [eax + 64] - movdqa xmm4, xmm0 - shufps xmm0, xmm1, 0x88 - shufps xmm4, xmm1, 0xdd - pavgb xmm0, xmm4 - movdqa xmm4, xmm2 - shufps xmm2, xmm3, 0x88 - shufps xmm4, xmm3, 0xdd - pavgb xmm2, xmm4 - - // step 2 - convert to U and V - // from here down is very similar to Y code except - // instead of 16 different pixels, its 8 pixels of U and 8 of V - movdqa xmm1, xmm0 - movdqa xmm3, xmm2 - pmaddubsw xmm0, xmm7 // U - pmaddubsw xmm2, xmm7 - pmaddubsw xmm1, xmm6 // V - pmaddubsw xmm3, xmm6 - phaddw xmm0, xmm2 - phaddw xmm1, xmm3 - psraw xmm0, 8 - psraw xmm1, 8 - packsswb xmm0, xmm1 - paddb xmm0, xmm5 // -> unsigned - - // step 3 - store 8 U and 8 V values - movlps qword ptr [edx], xmm0 // U - movhps qword ptr [edx + edi], xmm0 // V - lea edx, [edx + 8] - sub ecx, 16 - jg convertloop - - pop edi - pop esi - ret - } -} - -__declspec(naked) void ABGRToUVRow_SSSE3(const uint8_t* src_argb, - int src_stride_argb, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - __asm { - push esi - push edi - mov eax, [esp + 8 + 4] // src_argb - mov esi, [esp + 8 + 8] // src_stride_argb - mov edx, [esp + 8 + 12] // dst_u - mov edi, [esp + 8 + 16] // dst_v - mov ecx, [esp + 8 + 20] // width - movdqa xmm5, xmmword ptr kBiasUV128 - movdqa xmm6, xmmword ptr kABGRToV - movdqa xmm7, xmmword ptr kABGRToU - sub edi, edx // stride from u to v - - convertloop: - /* step 1 - subsample 16x2 argb pixels to 8x1 */ - movdqu xmm0, [eax] - movdqu xmm4, [eax + esi] - pavgb xmm0, xmm4 - movdqu xmm1, [eax + 16] - movdqu xmm4, [eax + esi + 16] - pavgb xmm1, xmm4 - movdqu xmm2, [eax + 32] - movdqu xmm4, [eax + esi + 32] - pavgb xmm2, xmm4 - movdqu xmm3, [eax + 48] - movdqu xmm4, [eax + esi + 48] - pavgb xmm3, xmm4 - - lea eax, [eax + 64] - movdqa xmm4, xmm0 - shufps xmm0, xmm1, 0x88 - shufps xmm4, xmm1, 0xdd - pavgb xmm0, xmm4 - movdqa xmm4, xmm2 - shufps xmm2, xmm3, 0x88 - shufps xmm4, xmm3, 0xdd - pavgb xmm2, xmm4 - - // step 2 - convert to U and V - // from here down is very similar to Y code except - // instead of 16 different pixels, its 8 pixels of U and 8 of V - movdqa xmm1, xmm0 - movdqa xmm3, xmm2 - pmaddubsw xmm0, xmm7 // U - pmaddubsw xmm2, xmm7 - pmaddubsw xmm1, xmm6 // V - pmaddubsw xmm3, xmm6 - phaddw xmm0, xmm2 - phaddw xmm1, xmm3 - psraw xmm0, 8 - psraw xmm1, 8 - packsswb xmm0, xmm1 - paddb xmm0, xmm5 // -> unsigned - - // step 3 - 
store 8 U and 8 V values - movlps qword ptr [edx], xmm0 // U - movhps qword ptr [edx + edi], xmm0 // V - lea edx, [edx + 8] - sub ecx, 16 - jg convertloop - - pop edi - pop esi - ret - } -} - -__declspec(naked) void RGBAToUVRow_SSSE3(const uint8_t* src_argb, - int src_stride_argb, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - __asm { - push esi - push edi - mov eax, [esp + 8 + 4] // src_argb - mov esi, [esp + 8 + 8] // src_stride_argb - mov edx, [esp + 8 + 12] // dst_u - mov edi, [esp + 8 + 16] // dst_v - mov ecx, [esp + 8 + 20] // width - movdqa xmm5, xmmword ptr kBiasUV128 - movdqa xmm6, xmmword ptr kRGBAToV - movdqa xmm7, xmmword ptr kRGBAToU - sub edi, edx // stride from u to v - - convertloop: - /* step 1 - subsample 16x2 argb pixels to 8x1 */ - movdqu xmm0, [eax] - movdqu xmm4, [eax + esi] - pavgb xmm0, xmm4 - movdqu xmm1, [eax + 16] - movdqu xmm4, [eax + esi + 16] - pavgb xmm1, xmm4 - movdqu xmm2, [eax + 32] - movdqu xmm4, [eax + esi + 32] - pavgb xmm2, xmm4 - movdqu xmm3, [eax + 48] - movdqu xmm4, [eax + esi + 48] - pavgb xmm3, xmm4 - - lea eax, [eax + 64] - movdqa xmm4, xmm0 - shufps xmm0, xmm1, 0x88 - shufps xmm4, xmm1, 0xdd - pavgb xmm0, xmm4 - movdqa xmm4, xmm2 - shufps xmm2, xmm3, 0x88 - shufps xmm4, xmm3, 0xdd - pavgb xmm2, xmm4 - - // step 2 - convert to U and V - // from here down is very similar to Y code except - // instead of 16 different pixels, its 8 pixels of U and 8 of V - movdqa xmm1, xmm0 - movdqa xmm3, xmm2 - pmaddubsw xmm0, xmm7 // U - pmaddubsw xmm2, xmm7 - pmaddubsw xmm1, xmm6 // V - pmaddubsw xmm3, xmm6 - phaddw xmm0, xmm2 - phaddw xmm1, xmm3 - psraw xmm0, 8 - psraw xmm1, 8 - packsswb xmm0, xmm1 - paddb xmm0, xmm5 // -> unsigned - - // step 3 - store 8 U and 8 V values - movlps qword ptr [edx], xmm0 // U - movhps qword ptr [edx + edi], xmm0 // V - lea edx, [edx + 8] - sub ecx, 16 - jg convertloop - - pop edi - pop esi - ret - } -} -#endif // HAS_ARGBTOYROW_SSSE3 - -// Read 16 UV from 444 -#define READYUV444_AVX2 \ - __asm { \ - __asm vmovdqu xmm3, [esi] /* U */ \ - __asm vmovdqu xmm1, [esi + edi] /* V */ \ - __asm lea esi, [esi + 16] \ - __asm vpermq ymm3, ymm3, 0xd8 \ - __asm vpermq ymm1, ymm1, 0xd8 \ - __asm vpunpcklbw ymm3, ymm3, ymm1 /* UV */ \ - __asm vmovdqu xmm4, [eax] /* Y */ \ - __asm vpermq ymm4, ymm4, 0xd8 \ - __asm vpunpcklbw ymm4, ymm4, ymm4 \ - __asm lea eax, [eax + 16]} - -// Read 16 UV from 444. With 16 Alpha. -#define READYUVA444_AVX2 \ - __asm { \ - __asm vmovdqu xmm3, [esi] /* U */ \ - __asm vmovdqu xmm1, [esi + edi] /* V */ \ - __asm lea esi, [esi + 16] \ - __asm vpermq ymm3, ymm3, 0xd8 \ - __asm vpermq ymm1, ymm1, 0xd8 \ - __asm vpunpcklbw ymm3, ymm3, ymm1 /* UV */ \ - __asm vmovdqu xmm4, [eax] /* Y */ \ - __asm vpermq ymm4, ymm4, 0xd8 \ - __asm vpunpcklbw ymm4, ymm4, ymm4 \ - __asm lea eax, [eax + 16] \ - __asm vmovdqu xmm5, [ebp] /* A */ \ - __asm vpermq ymm5, ymm5, 0xd8 \ - __asm lea ebp, [ebp + 16]} - -// Read 8 UV from 422, upsample to 16 UV. -#define READYUV422_AVX2 \ - __asm { \ - __asm vmovq xmm3, qword ptr [esi] /* U */ \ - __asm vmovq xmm1, qword ptr [esi + edi] /* V */ \ - __asm lea esi, [esi + 8] \ - __asm vpunpcklbw ymm3, ymm3, ymm1 /* UV */ \ - __asm vpermq ymm3, ymm3, 0xd8 \ - __asm vpunpcklwd ymm3, ymm3, ymm3 /* UVUV (upsample) */ \ - __asm vmovdqu xmm4, [eax] /* Y */ \ - __asm vpermq ymm4, ymm4, 0xd8 \ - __asm vpunpcklbw ymm4, ymm4, ymm4 \ - __asm lea eax, [eax + 16]} - -// Read 8 UV from 422, upsample to 16 UV. With 16 Alpha. 
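Step 1 of the ARGBToUV kernels above reduces a 16x2 block of ARGB pixels to 8x1 by averaging each 2x2 quad (one pavgb vertically, then shufps plus pavgb horizontally) before the U/V dot products run. A scalar sketch of one such 2x2 average; rounding differs slightly from the two-stage pavgb form, and the helper is illustrative only:

#include <stdint.h>

/* Average a 2x2 block of BGRA pixels (two rows, two columns) per channel. */
static void AverageArgb2x2_sketch(const uint8_t* row0, const uint8_t* row1,
                                  uint8_t avg_bgra[4]) {
  for (int c = 0; c < 4; ++c) {
    avg_bgra[c] =
        (uint8_t)((row0[c] + row0[4 + c] + row1[c] + row1[4 + c] + 2) >> 2);
  }
}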
-#define READYUVA422_AVX2 \ - __asm { \ - __asm vmovq xmm3, qword ptr [esi] /* U */ \ - __asm vmovq xmm1, qword ptr [esi + edi] /* V */ \ - __asm lea esi, [esi + 8] \ - __asm vpunpcklbw ymm3, ymm3, ymm1 /* UV */ \ - __asm vpermq ymm3, ymm3, 0xd8 \ - __asm vpunpcklwd ymm3, ymm3, ymm3 /* UVUV (upsample) */ \ - __asm vmovdqu xmm4, [eax] /* Y */ \ - __asm vpermq ymm4, ymm4, 0xd8 \ - __asm vpunpcklbw ymm4, ymm4, ymm4 \ - __asm lea eax, [eax + 16] \ - __asm vmovdqu xmm5, [ebp] /* A */ \ - __asm vpermq ymm5, ymm5, 0xd8 \ - __asm lea ebp, [ebp + 16]} - -// Read 8 UV from NV12, upsample to 16 UV. -#define READNV12_AVX2 \ - __asm { \ - __asm vmovdqu xmm3, [esi] /* UV */ \ - __asm lea esi, [esi + 16] \ - __asm vpermq ymm3, ymm3, 0xd8 \ - __asm vpunpcklwd ymm3, ymm3, ymm3 /* UVUV (upsample) */ \ - __asm vmovdqu xmm4, [eax] /* Y */ \ - __asm vpermq ymm4, ymm4, 0xd8 \ - __asm vpunpcklbw ymm4, ymm4, ymm4 \ - __asm lea eax, [eax + 16]} - -// Read 8 UV from NV21, upsample to 16 UV. -#define READNV21_AVX2 \ - __asm { \ - __asm vmovdqu xmm3, [esi] /* UV */ \ - __asm lea esi, [esi + 16] \ - __asm vpermq ymm3, ymm3, 0xd8 \ - __asm vpshufb ymm3, ymm3, ymmword ptr kShuffleNV21 \ - __asm vmovdqu xmm4, [eax] /* Y */ \ - __asm vpermq ymm4, ymm4, 0xd8 \ - __asm vpunpcklbw ymm4, ymm4, ymm4 \ - __asm lea eax, [eax + 16]} - -// Read 8 YUY2 with 16 Y and upsample 8 UV to 16 UV. -#define READYUY2_AVX2 \ - __asm { \ - __asm vmovdqu ymm4, [eax] /* YUY2 */ \ - __asm vpshufb ymm4, ymm4, ymmword ptr kShuffleYUY2Y \ - __asm vmovdqu ymm3, [eax] /* UV */ \ - __asm vpshufb ymm3, ymm3, ymmword ptr kShuffleYUY2UV \ - __asm lea eax, [eax + 32]} - -// Read 8 UYVY with 16 Y and upsample 8 UV to 16 UV. -#define READUYVY_AVX2 \ - __asm { \ - __asm vmovdqu ymm4, [eax] /* UYVY */ \ - __asm vpshufb ymm4, ymm4, ymmword ptr kShuffleUYVYY \ - __asm vmovdqu ymm3, [eax] /* UV */ \ - __asm vpshufb ymm3, ymm3, ymmword ptr kShuffleUYVYUV \ - __asm lea eax, [eax + 32]} - -// Convert 16 pixels: 16 UV and 16 Y. -#define YUVTORGB_AVX2(YuvConstants) \ - __asm { \ - __asm vpsubb ymm3, ymm3, ymmword ptr kBiasUV128 \ - __asm vpmulhuw ymm4, ymm4, ymmword ptr [YuvConstants + KYTORGB] \ - __asm vmovdqa ymm0, ymmword ptr [YuvConstants + KUVTOB] \ - __asm vmovdqa ymm1, ymmword ptr [YuvConstants + KUVTOG] \ - __asm vmovdqa ymm2, ymmword ptr [YuvConstants + KUVTOR] \ - __asm vpmaddubsw ymm0, ymm0, ymm3 /* B UV */ \ - __asm vpmaddubsw ymm1, ymm1, ymm3 /* G UV */ \ - __asm vpmaddubsw ymm2, ymm2, ymm3 /* B UV */ \ - __asm vmovdqu ymm3, ymmword ptr [YuvConstants + KYBIASTORGB] \ - __asm vpaddw ymm4, ymm3, ymm4 \ - __asm vpaddsw ymm0, ymm0, ymm4 \ - __asm vpsubsw ymm1, ymm4, ymm1 \ - __asm vpaddsw ymm2, ymm2, ymm4 \ - __asm vpsraw ymm0, ymm0, 6 \ - __asm vpsraw ymm1, ymm1, 6 \ - __asm vpsraw ymm2, ymm2, 6 \ - __asm vpackuswb ymm0, ymm0, ymm0 \ - __asm vpackuswb ymm1, ymm1, ymm1 \ - __asm vpackuswb ymm2, ymm2, ymm2 \ - } - -// Store 16 ARGB values. -#define STOREARGB_AVX2 \ - __asm { \ - __asm vpunpcklbw ymm0, ymm0, ymm1 /* BG */ \ - __asm vpermq ymm0, ymm0, 0xd8 \ - __asm vpunpcklbw ymm2, ymm2, ymm5 /* RA */ \ - __asm vpermq ymm2, ymm2, 0xd8 \ - __asm vpunpcklwd ymm1, ymm0, ymm2 /* BGRA first 8 pixels */ \ - __asm vpunpckhwd ymm0, ymm0, ymm2 /* BGRA next 8 pixels */ \ - __asm vmovdqu 0[edx], ymm1 \ - __asm vmovdqu 32[edx], ymm0 \ - __asm lea edx, [edx + 64]} - -// Store 16 RGBA values. 
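The YUVTORGB macros above are a 6-bit fixed-point version of the familiar YUV-to-RGB transform: UV is re-centered around zero (the kBiasUV128 subtraction), Y is scaled through pmulhuw, and three signed dot products produce B, G and R before the shift by 6 and unsigned saturation. A floating-point scalar reference of what one pixel works out to, assuming BT.601 studio-swing constants (the exact contents of the YuvConstants tables are not shown here, so the numbers below are an approximation):

#include <stdint.h>

static uint8_t Clamp255(float x) {
  return (uint8_t)(x < 0.0f ? 0.0f : (x > 255.0f ? 255.0f : x));
}

/* Approximate scalar model of one YUVTORGB + STOREARGB pixel. */
static void YuvPixelToArgb_sketch(uint8_t y, uint8_t u, uint8_t v,
                                  uint8_t bgra[4]) {
  float yf = 1.164f * (float)(y - 16);  /* Y expanded from studio swing */
  float uf = (float)u - 128.0f;         /* UV re-centered, as kBiasUV128 does */
  float vf = (float)v - 128.0f;
  bgra[0] = Clamp255(yf + 2.018f * uf);                 /* B */
  bgra[1] = Clamp255(yf - 0.391f * uf - 0.813f * vf);   /* G */
  bgra[2] = Clamp255(yf + 1.596f * vf);                 /* R */
  bgra[3] = 0xff;                                       /* A */
}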
-#define STORERGBA_AVX2 \ - __asm { \ - __asm vpunpcklbw ymm1, ymm1, ymm2 /* GR */ \ - __asm vpermq ymm1, ymm1, 0xd8 \ - __asm vpunpcklbw ymm2, ymm5, ymm0 /* AB */ \ - __asm vpermq ymm2, ymm2, 0xd8 \ - __asm vpunpcklwd ymm0, ymm2, ymm1 /* ABGR first 8 pixels */ \ - __asm vpunpckhwd ymm1, ymm2, ymm1 /* ABGR next 8 pixels */ \ - __asm vmovdqu [edx], ymm0 \ - __asm vmovdqu [edx + 32], ymm1 \ - __asm lea edx, [edx + 64]} - -#ifdef HAS_I422TOARGBROW_AVX2 -// 16 pixels -// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes). -__declspec(naked) void I422ToARGBRow_AVX2( - const uint8_t* y_buf, - const uint8_t* u_buf, - const uint8_t* v_buf, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { - __asm { - push esi - push edi - push ebx - mov eax, [esp + 12 + 4] // Y - mov esi, [esp + 12 + 8] // U - mov edi, [esp + 12 + 12] // V - mov edx, [esp + 12 + 16] // argb - mov ebx, [esp + 12 + 20] // yuvconstants - mov ecx, [esp + 12 + 24] // width - sub edi, esi - vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha - - convertloop: - READYUV422_AVX2 - YUVTORGB_AVX2(ebx) - STOREARGB_AVX2 - - sub ecx, 16 - jg convertloop - - pop ebx - pop edi - pop esi - vzeroupper - ret - } -} -#endif // HAS_I422TOARGBROW_AVX2 - -#ifdef HAS_I422ALPHATOARGBROW_AVX2 -// 16 pixels -// 8 UV values upsampled to 16 UV, mixed with 16 Y and 16 A producing 16 ARGB. -__declspec(naked) void I422AlphaToARGBRow_AVX2( - const uint8_t* y_buf, - const uint8_t* u_buf, - const uint8_t* v_buf, - const uint8_t* a_buf, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { - __asm { - push esi - push edi - push ebx - push ebp - mov eax, [esp + 16 + 4] // Y - mov esi, [esp + 16 + 8] // U - mov edi, [esp + 16 + 12] // V - mov ebp, [esp + 16 + 16] // A - mov edx, [esp + 16 + 20] // argb - mov ebx, [esp + 16 + 24] // yuvconstants - mov ecx, [esp + 16 + 28] // width - sub edi, esi - - convertloop: - READYUVA422_AVX2 - YUVTORGB_AVX2(ebx) - STOREARGB_AVX2 - - sub ecx, 16 - jg convertloop - - pop ebp - pop ebx - pop edi - pop esi - vzeroupper - ret - } -} -#endif // HAS_I422ALPHATOARGBROW_AVX2 - -#ifdef HAS_I444TOARGBROW_AVX2 -// 16 pixels -// 16 UV values with 16 Y producing 16 ARGB (64 bytes). -__declspec(naked) void I444ToARGBRow_AVX2( - const uint8_t* y_buf, - const uint8_t* u_buf, - const uint8_t* v_buf, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { - __asm { - push esi - push edi - push ebx - mov eax, [esp + 12 + 4] // Y - mov esi, [esp + 12 + 8] // U - mov edi, [esp + 12 + 12] // V - mov edx, [esp + 12 + 16] // argb - mov ebx, [esp + 12 + 20] // yuvconstants - mov ecx, [esp + 12 + 24] // width - sub edi, esi - vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha - convertloop: - READYUV444_AVX2 - YUVTORGB_AVX2(ebx) - STOREARGB_AVX2 - - sub ecx, 16 - jg convertloop - - pop ebx - pop edi - pop esi - vzeroupper - ret - } -} -#endif // HAS_I444TOARGBROW_AVX2 - -#ifdef HAS_I444ALPHATOARGBROW_AVX2 -// 16 pixels -// 16 UV values with 16 Y producing 16 ARGB (64 bytes). 
-__declspec(naked) void I444AlphaToARGBRow_AVX2( - const uint8_t* y_buf, - const uint8_t* u_buf, - const uint8_t* v_buf, - const uint8_t* a_buf, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { - __asm { - push esi - push edi - push ebx - push ebp - mov eax, [esp + 16 + 4] // Y - mov esi, [esp + 16 + 8] // U - mov edi, [esp + 16 + 12] // V - mov ebp, [esp + 16 + 16] // A - mov edx, [esp + 16 + 20] // argb - mov ebx, [esp + 16 + 24] // yuvconstants - mov ecx, [esp + 16 + 28] // width - sub edi, esi - convertloop: - READYUVA444_AVX2 - YUVTORGB_AVX2(ebx) - STOREARGB_AVX2 - - sub ecx, 16 - jg convertloop - - pop ebp - pop ebx - pop edi - pop esi - vzeroupper - ret - } -} -#endif // HAS_I444AlphaTOARGBROW_AVX2 - -#ifdef HAS_NV12TOARGBROW_AVX2 -// 16 pixels. -// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes). -__declspec(naked) void NV12ToARGBRow_AVX2( - const uint8_t* y_buf, - const uint8_t* uv_buf, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { - __asm { - push esi - push ebx - mov eax, [esp + 8 + 4] // Y - mov esi, [esp + 8 + 8] // UV - mov edx, [esp + 8 + 12] // argb - mov ebx, [esp + 8 + 16] // yuvconstants - mov ecx, [esp + 8 + 20] // width - vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha - - convertloop: - READNV12_AVX2 - YUVTORGB_AVX2(ebx) - STOREARGB_AVX2 - - sub ecx, 16 - jg convertloop - - pop ebx - pop esi - vzeroupper - ret - } -} -#endif // HAS_NV12TOARGBROW_AVX2 - -#ifdef HAS_NV21TOARGBROW_AVX2 -// 16 pixels. -// 8 VU values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes). -__declspec(naked) void NV21ToARGBRow_AVX2( - const uint8_t* y_buf, - const uint8_t* vu_buf, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { - __asm { - push esi - push ebx - mov eax, [esp + 8 + 4] // Y - mov esi, [esp + 8 + 8] // VU - mov edx, [esp + 8 + 12] // argb - mov ebx, [esp + 8 + 16] // yuvconstants - mov ecx, [esp + 8 + 20] // width - vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha - - convertloop: - READNV21_AVX2 - YUVTORGB_AVX2(ebx) - STOREARGB_AVX2 - - sub ecx, 16 - jg convertloop - - pop ebx - pop esi - vzeroupper - ret - } -} -#endif // HAS_NV21TOARGBROW_AVX2 - -#ifdef HAS_YUY2TOARGBROW_AVX2 -// 16 pixels. -// 8 YUY2 values with 16 Y and 8 UV producing 16 ARGB (64 bytes). -__declspec(naked) void YUY2ToARGBRow_AVX2( - const uint8_t* src_yuy2, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { - __asm { - push ebx - mov eax, [esp + 4 + 4] // yuy2 - mov edx, [esp + 4 + 8] // argb - mov ebx, [esp + 4 + 12] // yuvconstants - mov ecx, [esp + 4 + 16] // width - vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha - - convertloop: - READYUY2_AVX2 - YUVTORGB_AVX2(ebx) - STOREARGB_AVX2 - - sub ecx, 16 - jg convertloop - - pop ebx - vzeroupper - ret - } -} -#endif // HAS_YUY2TOARGBROW_AVX2 - -#ifdef HAS_UYVYTOARGBROW_AVX2 -// 16 pixels. -// 8 UYVY values with 16 Y and 8 UV producing 16 ARGB (64 bytes). 
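The READYUY2/READUYVY shuffles used by the packed-format converters above are de-interleaving fixed byte layouts: for a pair of pixels, YUY2 stores Y0 U0 Y1 V0 and UYVY stores U0 Y0 V0 Y1. A trivial scalar extraction for the YUY2 case (hypothetical helper, shown only to make the layout explicit):

#include <stdint.h>

/* Split one YUY2 macropixel (2 pixels, 4 bytes) into its Y and shared UV. */
static void Yuy2PairToYuv_sketch(const uint8_t* yuy2, uint8_t y[2],
                                 uint8_t* u, uint8_t* v) {
  y[0] = yuy2[0];  /* Y0 */
  *u = yuy2[1];    /* U shared by both pixels */
  y[1] = yuy2[2];  /* Y1 */
  *v = yuy2[3];    /* V shared by both pixels */
}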
-__declspec(naked) void UYVYToARGBRow_AVX2( - const uint8_t* src_uyvy, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { - __asm { - push ebx - mov eax, [esp + 4 + 4] // uyvy - mov edx, [esp + 4 + 8] // argb - mov ebx, [esp + 4 + 12] // yuvconstants - mov ecx, [esp + 4 + 16] // width - vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha - - convertloop: - READUYVY_AVX2 - YUVTORGB_AVX2(ebx) - STOREARGB_AVX2 - - sub ecx, 16 - jg convertloop - - pop ebx - vzeroupper - ret - } -} -#endif // HAS_UYVYTOARGBROW_AVX2 - -#ifdef HAS_I422TORGBAROW_AVX2 -// 16 pixels -// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 RGBA (64 bytes). -__declspec(naked) void I422ToRGBARow_AVX2( - const uint8_t* y_buf, - const uint8_t* u_buf, - const uint8_t* v_buf, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { - __asm { - push esi - push edi - push ebx - mov eax, [esp + 12 + 4] // Y - mov esi, [esp + 12 + 8] // U - mov edi, [esp + 12 + 12] // V - mov edx, [esp + 12 + 16] // abgr - mov ebx, [esp + 12 + 20] // yuvconstants - mov ecx, [esp + 12 + 24] // width - sub edi, esi - vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha - - convertloop: - READYUV422_AVX2 - YUVTORGB_AVX2(ebx) - STORERGBA_AVX2 - - sub ecx, 16 - jg convertloop - - pop ebx - pop edi - pop esi - vzeroupper - ret - } -} -#endif // HAS_I422TORGBAROW_AVX2 - -#if defined(HAS_I422TOARGBROW_SSSE3) -// TODO(fbarchard): Read that does half size on Y and treats 420 as 444. -// Allows a conversion with half size scaling. - -// Read 8 UV from 444. -#define READYUV444 \ - __asm { \ - __asm movq xmm3, qword ptr [esi] /* U */ \ - __asm movq xmm1, qword ptr [esi + edi] /* V */ \ - __asm lea esi, [esi + 8] \ - __asm punpcklbw xmm3, xmm1 /* UV */ \ - __asm movq xmm4, qword ptr [eax] \ - __asm punpcklbw xmm4, xmm4 \ - __asm lea eax, [eax + 8]} - -// Read 4 UV from 444. With 8 Alpha. -#define READYUVA444 \ - __asm { \ - __asm movq xmm3, qword ptr [esi] /* U */ \ - __asm movq xmm1, qword ptr [esi + edi] /* V */ \ - __asm lea esi, [esi + 8] \ - __asm punpcklbw xmm3, xmm1 /* UV */ \ - __asm movq xmm4, qword ptr [eax] \ - __asm punpcklbw xmm4, xmm4 \ - __asm lea eax, [eax + 8] \ - __asm movq xmm5, qword ptr [ebp] /* A */ \ - __asm lea ebp, [ebp + 8]} - -// Read 4 UV from 422, upsample to 8 UV. -#define READYUV422 \ - __asm { \ - __asm movd xmm3, [esi] /* U */ \ - __asm movd xmm1, [esi + edi] /* V */ \ - __asm lea esi, [esi + 4] \ - __asm punpcklbw xmm3, xmm1 /* UV */ \ - __asm punpcklwd xmm3, xmm3 /* UVUV (upsample) */ \ - __asm movq xmm4, qword ptr [eax] \ - __asm punpcklbw xmm4, xmm4 \ - __asm lea eax, [eax + 8]} - -// Read 4 UV from 422, upsample to 8 UV. With 8 Alpha. -#define READYUVA422 \ - __asm { \ - __asm movd xmm3, [esi] /* U */ \ - __asm movd xmm1, [esi + edi] /* V */ \ - __asm lea esi, [esi + 4] \ - __asm punpcklbw xmm3, xmm1 /* UV */ \ - __asm punpcklwd xmm3, xmm3 /* UVUV (upsample) */ \ - __asm movq xmm4, qword ptr [eax] /* Y */ \ - __asm punpcklbw xmm4, xmm4 \ - __asm lea eax, [eax + 8] \ - __asm movq xmm5, qword ptr [ebp] /* A */ \ - __asm lea ebp, [ebp + 8]} - -// Read 4 UV from NV12, upsample to 8 UV. -#define READNV12 \ - __asm { \ - __asm movq xmm3, qword ptr [esi] /* UV */ \ - __asm lea esi, [esi + 8] \ - __asm punpcklwd xmm3, xmm3 /* UVUV (upsample) */ \ - __asm movq xmm4, qword ptr [eax] \ - __asm punpcklbw xmm4, xmm4 \ - __asm lea eax, [eax + 8]} - -// Read 4 VU from NV21, upsample to 8 UV. 
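The kShuffleNV21 constant defined at the top of this section feeds the READNV21 paths: NV21 interleaves chroma as V,U rather than U,V, so the shuffle swaps each pair and duplicates it for the 2x horizontal upsample. In scalar terms (illustrative helper only, not libyuv code):

#include <stdint.h>

/* One NV21 chroma pair (V,U) expanded to two U,V pairs for two pixels. */
static void Nv21PairToUv_sketch(const uint8_t* vu, uint8_t uv_out[4]) {
  uv_out[0] = vu[1];  /* U */
  uv_out[1] = vu[0];  /* V */
  uv_out[2] = vu[1];  /* U duplicated: one chroma sample covers 2 pixels */
  uv_out[3] = vu[0];  /* V duplicated */
}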
-#define READNV21 \ - __asm { \ - __asm movq xmm3, qword ptr [esi] /* UV */ \ - __asm lea esi, [esi + 8] \ - __asm pshufb xmm3, xmmword ptr kShuffleNV21 \ - __asm movq xmm4, qword ptr [eax] \ - __asm punpcklbw xmm4, xmm4 \ - __asm lea eax, [eax + 8]} - -// Read 4 YUY2 with 8 Y and upsample 4 UV to 8 UV. -#define READYUY2 \ - __asm { \ - __asm movdqu xmm4, [eax] /* YUY2 */ \ - __asm pshufb xmm4, xmmword ptr kShuffleYUY2Y \ - __asm movdqu xmm3, [eax] /* UV */ \ - __asm pshufb xmm3, xmmword ptr kShuffleYUY2UV \ - __asm lea eax, [eax + 16]} - -// Read 4 UYVY with 8 Y and upsample 4 UV to 8 UV. -#define READUYVY \ - __asm { \ - __asm movdqu xmm4, [eax] /* UYVY */ \ - __asm pshufb xmm4, xmmword ptr kShuffleUYVYY \ - __asm movdqu xmm3, [eax] /* UV */ \ - __asm pshufb xmm3, xmmword ptr kShuffleUYVYUV \ - __asm lea eax, [eax + 16]} - -// Convert 8 pixels: 8 UV and 8 Y. -#define YUVTORGB(YuvConstants) \ - __asm { \ - __asm psubb xmm3, xmmword ptr kBiasUV128 \ - __asm pmulhuw xmm4, xmmword ptr [YuvConstants + KYTORGB] \ - __asm movdqa xmm0, xmmword ptr [YuvConstants + KUVTOB] \ - __asm movdqa xmm1, xmmword ptr [YuvConstants + KUVTOG] \ - __asm movdqa xmm2, xmmword ptr [YuvConstants + KUVTOR] \ - __asm pmaddubsw xmm0, xmm3 \ - __asm pmaddubsw xmm1, xmm3 \ - __asm pmaddubsw xmm2, xmm3 \ - __asm movdqa xmm3, xmmword ptr [YuvConstants + KYBIASTORGB] \ - __asm paddw xmm4, xmm3 \ - __asm paddsw xmm0, xmm4 \ - __asm paddsw xmm2, xmm4 \ - __asm psubsw xmm4, xmm1 \ - __asm movdqa xmm1, xmm4 \ - __asm psraw xmm0, 6 \ - __asm psraw xmm1, 6 \ - __asm psraw xmm2, 6 \ - __asm packuswb xmm0, xmm0 /* B */ \ - __asm packuswb xmm1, xmm1 /* G */ \ - __asm packuswb xmm2, xmm2 /* R */ \ - } - -// Store 8 ARGB values. -#define STOREARGB \ - __asm { \ - __asm punpcklbw xmm0, xmm1 /* BG */ \ - __asm punpcklbw xmm2, xmm5 /* RA */ \ - __asm movdqa xmm1, xmm0 \ - __asm punpcklwd xmm0, xmm2 /* BGRA first 4 pixels */ \ - __asm punpckhwd xmm1, xmm2 /* BGRA next 4 pixels */ \ - __asm movdqu 0[edx], xmm0 \ - __asm movdqu 16[edx], xmm1 \ - __asm lea edx, [edx + 32]} - -// Store 8 BGRA values. -#define STOREBGRA \ - __asm { \ - __asm pcmpeqb xmm5, xmm5 /* generate 0xffffffff for alpha */ \ - __asm punpcklbw xmm1, xmm0 /* GB */ \ - __asm punpcklbw xmm5, xmm2 /* AR */ \ - __asm movdqa xmm0, xmm5 \ - __asm punpcklwd xmm5, xmm1 /* BGRA first 4 pixels */ \ - __asm punpckhwd xmm0, xmm1 /* BGRA next 4 pixels */ \ - __asm movdqu 0[edx], xmm5 \ - __asm movdqu 16[edx], xmm0 \ - __asm lea edx, [edx + 32]} - -// Store 8 RGBA values. -#define STORERGBA \ - __asm { \ - __asm pcmpeqb xmm5, xmm5 /* generate 0xffffffff for alpha */ \ - __asm punpcklbw xmm1, xmm2 /* GR */ \ - __asm punpcklbw xmm5, xmm0 /* AB */ \ - __asm movdqa xmm0, xmm5 \ - __asm punpcklwd xmm5, xmm1 /* RGBA first 4 pixels */ \ - __asm punpckhwd xmm0, xmm1 /* RGBA next 4 pixels */ \ - __asm movdqu 0[edx], xmm5 \ - __asm movdqu 16[edx], xmm0 \ - __asm lea edx, [edx + 32]} - -// Store 8 RGB24 values. -#define STORERGB24 \ - __asm {/* Weave into RRGB */ \ - __asm punpcklbw xmm0, xmm1 /* BG */ \ - __asm punpcklbw xmm2, xmm2 /* RR */ \ - __asm movdqa xmm1, xmm0 \ - __asm punpcklwd xmm0, xmm2 /* BGRR first 4 pixels */ \ - __asm punpckhwd xmm1, xmm2 /* BGRR next 4 pixels */ /* RRGB -> RGB24 */ \ - __asm pshufb xmm0, xmm5 /* Pack first 8 and last 4 bytes. */ \ - __asm pshufb xmm1, xmm6 /* Pack first 12 bytes. 
*/ \ - __asm palignr xmm1, xmm0, 12 /* last 4 bytes of xmm0 + 12 xmm1 */ \ - __asm movq qword ptr 0[edx], xmm0 /* First 8 bytes */ \ - __asm movdqu 8[edx], xmm1 /* Last 16 bytes */ \ - __asm lea edx, [edx + 24]} - -// Store 8 RGB565 values. -#define STORERGB565 \ - __asm {/* Weave into RRGB */ \ - __asm punpcklbw xmm0, xmm1 /* BG */ \ - __asm punpcklbw xmm2, xmm2 /* RR */ \ - __asm movdqa xmm1, xmm0 \ - __asm punpcklwd xmm0, xmm2 /* BGRR first 4 pixels */ \ - __asm punpckhwd xmm1, xmm2 /* BGRR next 4 pixels */ /* RRGB -> RGB565 */ \ - __asm movdqa xmm3, xmm0 /* B first 4 pixels of argb */ \ - __asm movdqa xmm2, xmm0 /* G */ \ - __asm pslld xmm0, 8 /* R */ \ - __asm psrld xmm3, 3 /* B */ \ - __asm psrld xmm2, 5 /* G */ \ - __asm psrad xmm0, 16 /* R */ \ - __asm pand xmm3, xmm5 /* B */ \ - __asm pand xmm2, xmm6 /* G */ \ - __asm pand xmm0, xmm7 /* R */ \ - __asm por xmm3, xmm2 /* BG */ \ - __asm por xmm0, xmm3 /* BGR */ \ - __asm movdqa xmm3, xmm1 /* B next 4 pixels of argb */ \ - __asm movdqa xmm2, xmm1 /* G */ \ - __asm pslld xmm1, 8 /* R */ \ - __asm psrld xmm3, 3 /* B */ \ - __asm psrld xmm2, 5 /* G */ \ - __asm psrad xmm1, 16 /* R */ \ - __asm pand xmm3, xmm5 /* B */ \ - __asm pand xmm2, xmm6 /* G */ \ - __asm pand xmm1, xmm7 /* R */ \ - __asm por xmm3, xmm2 /* BG */ \ - __asm por xmm1, xmm3 /* BGR */ \ - __asm packssdw xmm0, xmm1 \ - __asm movdqu 0[edx], xmm0 /* store 8 pixels of RGB565 */ \ - __asm lea edx, [edx + 16]} - -// 8 pixels. -// 8 UV values, mixed with 8 Y producing 8 ARGB (32 bytes). -__declspec(naked) void I444ToARGBRow_SSSE3( - const uint8_t* y_buf, - const uint8_t* u_buf, - const uint8_t* v_buf, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { - __asm { - push esi - push edi - push ebx - mov eax, [esp + 12 + 4] // Y - mov esi, [esp + 12 + 8] // U - mov edi, [esp + 12 + 12] // V - mov edx, [esp + 12 + 16] // argb - mov ebx, [esp + 12 + 20] // yuvconstants - mov ecx, [esp + 12 + 24] // width - sub edi, esi - pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha - - convertloop: - READYUV444 - YUVTORGB(ebx) - STOREARGB - - sub ecx, 8 - jg convertloop - - pop ebx - pop edi - pop esi - ret - } -} - -// 8 pixels. -// 8 UV values, mixed with 8 Y and 8A producing 8 ARGB (32 bytes). -__declspec(naked) void I444AlphaToARGBRow_SSSE3( - const uint8_t* y_buf, - const uint8_t* u_buf, - const uint8_t* v_buf, - const uint8_t* a_buf, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { - __asm { - push esi - push edi - push ebx - push ebp - mov eax, [esp + 16 + 4] // Y - mov esi, [esp + 16 + 8] // U - mov edi, [esp + 16 + 12] // V - mov ebp, [esp + 16 + 16] // A - mov edx, [esp + 16 + 20] // argb - mov ebx, [esp + 16 + 24] // yuvconstants - mov ecx, [esp + 16 + 28] // width - sub edi, esi - - convertloop: - READYUVA444 - YUVTORGB(ebx) - STOREARGB - - sub ecx, 8 - jg convertloop - - pop ebp - pop ebx - pop edi - pop esi - ret - } -} - -// 8 pixels. -// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 RGB24 (24 bytes). 
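STORERGB24 above is ultimately just dropping the alpha byte and tightly packing B,G,R triples; the shuffles and palignr exist only to emit 8 pixels per store efficiently. The scalar equivalent, with a hypothetical helper name:

#include <stdint.h>

/* Drop alpha from each BGRA pixel, producing packed 3-byte RGB24 pixels. */
static void ArgbToRgb24Row_sketch(const uint8_t* src_argb, uint8_t* dst_rgb24,
                                  int width) {
  for (int x = 0; x < width; ++x) {
    dst_rgb24[0] = src_argb[0];  /* B */
    dst_rgb24[1] = src_argb[1];  /* G */
    dst_rgb24[2] = src_argb[2];  /* R */
    src_argb += 4;
    dst_rgb24 += 3;
  }
}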
-__declspec(naked) void I422ToRGB24Row_SSSE3( - const uint8_t* y_buf, - const uint8_t* u_buf, - const uint8_t* v_buf, - uint8_t* dst_rgb24, - const struct YuvConstants* yuvconstants, - int width) { - __asm { - push esi - push edi - push ebx - mov eax, [esp + 12 + 4] // Y - mov esi, [esp + 12 + 8] // U - mov edi, [esp + 12 + 12] // V - mov edx, [esp + 12 + 16] // argb - mov ebx, [esp + 12 + 20] // yuvconstants - mov ecx, [esp + 12 + 24] // width - sub edi, esi - movdqa xmm5, xmmword ptr kShuffleMaskARGBToRGB24_0 - movdqa xmm6, xmmword ptr kShuffleMaskARGBToRGB24 - - convertloop: - READYUV422 - YUVTORGB(ebx) - STORERGB24 - - sub ecx, 8 - jg convertloop - - pop ebx - pop edi - pop esi - ret - } -} - -// 8 pixels -// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 RGB565 (16 bytes). -__declspec(naked) void I422ToRGB565Row_SSSE3( - const uint8_t* y_buf, - const uint8_t* u_buf, - const uint8_t* v_buf, - uint8_t* rgb565_buf, - const struct YuvConstants* yuvconstants, - int width) { - __asm { - push esi - push edi - push ebx - mov eax, [esp + 12 + 4] // Y - mov esi, [esp + 12 + 8] // U - mov edi, [esp + 12 + 12] // V - mov edx, [esp + 12 + 16] // argb - mov ebx, [esp + 12 + 20] // yuvconstants - mov ecx, [esp + 12 + 24] // width - sub edi, esi - pcmpeqb xmm5, xmm5 // generate mask 0x0000001f - psrld xmm5, 27 - pcmpeqb xmm6, xmm6 // generate mask 0x000007e0 - psrld xmm6, 26 - pslld xmm6, 5 - pcmpeqb xmm7, xmm7 // generate mask 0xfffff800 - pslld xmm7, 11 - - convertloop: - READYUV422 - YUVTORGB(ebx) - STORERGB565 - - sub ecx, 8 - jg convertloop - - pop ebx - pop edi - pop esi - ret - } -} - -// 8 pixels. -// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes). -__declspec(naked) void I422ToARGBRow_SSSE3( - const uint8_t* y_buf, - const uint8_t* u_buf, - const uint8_t* v_buf, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { - __asm { - push esi - push edi - push ebx - mov eax, [esp + 12 + 4] // Y - mov esi, [esp + 12 + 8] // U - mov edi, [esp + 12 + 12] // V - mov edx, [esp + 12 + 16] // argb - mov ebx, [esp + 12 + 20] // yuvconstants - mov ecx, [esp + 12 + 24] // width - sub edi, esi - pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha - - convertloop: - READYUV422 - YUVTORGB(ebx) - STOREARGB - - sub ecx, 8 - jg convertloop - - pop ebx - pop edi - pop esi - ret - } -} - -// 8 pixels. -// 4 UV values upsampled to 8 UV, mixed with 8 Y and 8 A producing 8 ARGB. -__declspec(naked) void I422AlphaToARGBRow_SSSE3( - const uint8_t* y_buf, - const uint8_t* u_buf, - const uint8_t* v_buf, - const uint8_t* a_buf, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { - __asm { - push esi - push edi - push ebx - push ebp - mov eax, [esp + 16 + 4] // Y - mov esi, [esp + 16 + 8] // U - mov edi, [esp + 16 + 12] // V - mov ebp, [esp + 16 + 16] // A - mov edx, [esp + 16 + 20] // argb - mov ebx, [esp + 16 + 24] // yuvconstants - mov ecx, [esp + 16 + 28] // width - sub edi, esi - - convertloop: - READYUVA422 - YUVTORGB(ebx) - STOREARGB - - sub ecx, 8 - jg convertloop - - pop ebp - pop ebx - pop edi - pop esi - ret - } -} - -// 8 pixels. -// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes). 
-__declspec(naked) void NV12ToARGBRow_SSSE3( - const uint8_t* y_buf, - const uint8_t* uv_buf, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { - __asm { - push esi - push ebx - mov eax, [esp + 8 + 4] // Y - mov esi, [esp + 8 + 8] // UV - mov edx, [esp + 8 + 12] // argb - mov ebx, [esp + 8 + 16] // yuvconstants - mov ecx, [esp + 8 + 20] // width - pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha - - convertloop: - READNV12 - YUVTORGB(ebx) - STOREARGB - - sub ecx, 8 - jg convertloop - - pop ebx - pop esi - ret - } -} - -// 8 pixels. -// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes). -__declspec(naked) void NV21ToARGBRow_SSSE3( - const uint8_t* y_buf, - const uint8_t* vu_buf, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { - __asm { - push esi - push ebx - mov eax, [esp + 8 + 4] // Y - mov esi, [esp + 8 + 8] // VU - mov edx, [esp + 8 + 12] // argb - mov ebx, [esp + 8 + 16] // yuvconstants - mov ecx, [esp + 8 + 20] // width - pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha - - convertloop: - READNV21 - YUVTORGB(ebx) - STOREARGB - - sub ecx, 8 - jg convertloop - - pop ebx - pop esi - ret - } -} - -// 8 pixels. -// 4 YUY2 values with 8 Y and 4 UV producing 8 ARGB (32 bytes). -__declspec(naked) void YUY2ToARGBRow_SSSE3( - const uint8_t* src_yuy2, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { - __asm { - push ebx - mov eax, [esp + 4 + 4] // yuy2 - mov edx, [esp + 4 + 8] // argb - mov ebx, [esp + 4 + 12] // yuvconstants - mov ecx, [esp + 4 + 16] // width - pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha - - convertloop: - READYUY2 - YUVTORGB(ebx) - STOREARGB - - sub ecx, 8 - jg convertloop - - pop ebx - ret - } -} - -// 8 pixels. -// 4 UYVY values with 8 Y and 4 UV producing 8 ARGB (32 bytes). -__declspec(naked) void UYVYToARGBRow_SSSE3( - const uint8_t* src_uyvy, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { - __asm { - push ebx - mov eax, [esp + 4 + 4] // uyvy - mov edx, [esp + 4 + 8] // argb - mov ebx, [esp + 4 + 12] // yuvconstants - mov ecx, [esp + 4 + 16] // width - pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha - - convertloop: - READUYVY - YUVTORGB(ebx) - STOREARGB - - sub ecx, 8 - jg convertloop - - pop ebx - ret - } -} - -__declspec(naked) void I422ToRGBARow_SSSE3( - const uint8_t* y_buf, - const uint8_t* u_buf, - const uint8_t* v_buf, - uint8_t* dst_rgba, - const struct YuvConstants* yuvconstants, - int width) { - __asm { - push esi - push edi - push ebx - mov eax, [esp + 12 + 4] // Y - mov esi, [esp + 12 + 8] // U - mov edi, [esp + 12 + 12] // V - mov edx, [esp + 12 + 16] // argb - mov ebx, [esp + 12 + 20] // yuvconstants - mov ecx, [esp + 12 + 24] // width - sub edi, esi - - convertloop: - READYUV422 - YUVTORGB(ebx) - STORERGBA - - sub ecx, 8 - jg convertloop - - pop ebx - pop edi - pop esi - ret - } -} -#endif // HAS_I422TOARGBROW_SSSE3 - -// I400ToARGBRow_SSE2 is disabled due to new yuvconstant parameter -#ifdef HAS_I400TOARGBROW_SSE2 -// 8 pixels of Y converted to 8 pixels of ARGB (32 bytes). 
-__declspec(naked) void I400ToARGBRow_SSE2(const uint8_t* y_buf, - uint8_t* rgb_buf, - const struct YuvConstants*, - int width) { - __asm { - mov eax, 0x4a354a35 // 4a35 = 18997 = round(1.164 * 64 * 256) - movd xmm2, eax - pshufd xmm2, xmm2,0 - mov eax, 0x04880488 // 0488 = 1160 = round(1.164 * 64 * 16) - movd xmm3, eax - pshufd xmm3, xmm3, 0 - pcmpeqb xmm4, xmm4 // generate mask 0xff000000 - pslld xmm4, 24 - - mov eax, [esp + 4] // Y - mov edx, [esp + 8] // rgb - mov ecx, [esp + 12] // width - - convertloop: - // Step 1: Scale Y contribution to 8 G values. G = (y - 16) * 1.164 - movq xmm0, qword ptr [eax] - lea eax, [eax + 8] - punpcklbw xmm0, xmm0 // Y.Y - pmulhuw xmm0, xmm2 - psubusw xmm0, xmm3 - psrlw xmm0, 6 - packuswb xmm0, xmm0 // G - - // Step 2: Weave into ARGB - punpcklbw xmm0, xmm0 // GG - movdqa xmm1, xmm0 - punpcklwd xmm0, xmm0 // BGRA first 4 pixels - punpckhwd xmm1, xmm1 // BGRA next 4 pixels - por xmm0, xmm4 - por xmm1, xmm4 - movdqu [edx], xmm0 - movdqu [edx + 16], xmm1 - lea edx, [edx + 32] - sub ecx, 8 - jg convertloop - ret - } -} -#endif // HAS_I400TOARGBROW_SSE2 - -#ifdef HAS_I400TOARGBROW_AVX2 -// 16 pixels of Y converted to 16 pixels of ARGB (64 bytes). -// note: vpunpcklbw mutates and vpackuswb unmutates. -__declspec(naked) void I400ToARGBRow_AVX2(const uint8_t* y_buf, - uint8_t* rgb_buf, - const struct YuvConstants*, - int width) { - __asm { - mov eax, 0x4a354a35 // 4a35 = 18997 = round(1.164 * 64 * 256) - vmovd xmm2, eax - vbroadcastss ymm2, xmm2 - mov eax, 0x04880488 // 0488 = 1160 = round(1.164 * 64 * 16) - vmovd xmm3, eax - vbroadcastss ymm3, xmm3 - vpcmpeqb ymm4, ymm4, ymm4 // generate mask 0xff000000 - vpslld ymm4, ymm4, 24 - - mov eax, [esp + 4] // Y - mov edx, [esp + 8] // rgb - mov ecx, [esp + 12] // width - - convertloop: - // Step 1: Scale Y contriportbution to 16 G values. G = (y - 16) * 1.164 - vmovdqu xmm0, [eax] - lea eax, [eax + 16] - vpermq ymm0, ymm0, 0xd8 // vpunpcklbw mutates - vpunpcklbw ymm0, ymm0, ymm0 // Y.Y - vpmulhuw ymm0, ymm0, ymm2 - vpsubusw ymm0, ymm0, ymm3 - vpsrlw ymm0, ymm0, 6 - vpackuswb ymm0, ymm0, ymm0 // G. still mutated: 3120 - - // TODO(fbarchard): Weave alpha with unpack. - // Step 2: Weave into ARGB - vpunpcklbw ymm1, ymm0, ymm0 // GG - mutates - vpermq ymm1, ymm1, 0xd8 - vpunpcklwd ymm0, ymm1, ymm1 // GGGG first 8 pixels - vpunpckhwd ymm1, ymm1, ymm1 // GGGG next 8 pixels - vpor ymm0, ymm0, ymm4 - vpor ymm1, ymm1, ymm4 - vmovdqu [edx], ymm0 - vmovdqu [edx + 32], ymm1 - lea edx, [edx + 64] - sub ecx, 16 - jg convertloop - vzeroupper - ret - } -} -#endif // HAS_I400TOARGBROW_AVX2 - -#ifdef HAS_MIRRORROW_SSSE3 -// Shuffle table for reversing the bytes. -static const uvec8 kShuffleMirror = {15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u, - 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u}; - -// TODO(fbarchard): Replace lea with -16 offset. 
-__declspec(naked) void MirrorRow_SSSE3(const uint8_t* src, - uint8_t* dst, - int width) { - __asm { - mov eax, [esp + 4] // src - mov edx, [esp + 8] // dst - mov ecx, [esp + 12] // width - movdqa xmm5, xmmword ptr kShuffleMirror - - convertloop: - movdqu xmm0, [eax - 16 + ecx] - pshufb xmm0, xmm5 - movdqu [edx], xmm0 - lea edx, [edx + 16] - sub ecx, 16 - jg convertloop - ret - } -} -#endif // HAS_MIRRORROW_SSSE3 - -#ifdef HAS_MIRRORROW_AVX2 -__declspec(naked) void MirrorRow_AVX2(const uint8_t* src, - uint8_t* dst, - int width) { - __asm { - mov eax, [esp + 4] // src - mov edx, [esp + 8] // dst - mov ecx, [esp + 12] // width - vbroadcastf128 ymm5, xmmword ptr kShuffleMirror - - convertloop: - vmovdqu ymm0, [eax - 32 + ecx] - vpshufb ymm0, ymm0, ymm5 - vpermq ymm0, ymm0, 0x4e // swap high and low halfs - vmovdqu [edx], ymm0 - lea edx, [edx + 32] - sub ecx, 32 - jg convertloop - vzeroupper - ret - } -} -#endif // HAS_MIRRORROW_AVX2 - -#ifdef HAS_MIRRORSPLITUVROW_SSSE3 -// Shuffle table for reversing the bytes of UV channels. -static const uvec8 kShuffleMirrorUV = {14u, 12u, 10u, 8u, 6u, 4u, 2u, 0u, - 15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u}; - -__declspec(naked) void MirrorSplitUVRow_SSSE3(const uint8_t* src, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - __asm { - push edi - mov eax, [esp + 4 + 4] // src - mov edx, [esp + 4 + 8] // dst_u - mov edi, [esp + 4 + 12] // dst_v - mov ecx, [esp + 4 + 16] // width - movdqa xmm1, xmmword ptr kShuffleMirrorUV - lea eax, [eax + ecx * 2 - 16] - sub edi, edx - - convertloop: - movdqu xmm0, [eax] - lea eax, [eax - 16] - pshufb xmm0, xmm1 - movlpd qword ptr [edx], xmm0 - movhpd qword ptr [edx + edi], xmm0 - lea edx, [edx + 8] - sub ecx, 8 - jg convertloop - - pop edi - ret - } -} -#endif // HAS_MIRRORSPLITUVROW_SSSE3 - -#ifdef HAS_ARGBMIRRORROW_SSE2 -__declspec(naked) void ARGBMirrorRow_SSE2(const uint8_t* src, - uint8_t* dst, - int width) { - __asm { - mov eax, [esp + 4] // src - mov edx, [esp + 8] // dst - mov ecx, [esp + 12] // width - lea eax, [eax - 16 + ecx * 4] // last 4 pixels. - - convertloop: - movdqu xmm0, [eax] - lea eax, [eax - 16] - pshufd xmm0, xmm0, 0x1b - movdqu [edx], xmm0 - lea edx, [edx + 16] - sub ecx, 4 - jg convertloop - ret - } -} -#endif // HAS_ARGBMIRRORROW_SSE2 - -#ifdef HAS_ARGBMIRRORROW_AVX2 -// Shuffle table for reversing the bytes. 
-static const ulvec32 kARGBShuffleMirror_AVX2 = {7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u}; - -__declspec(naked) void ARGBMirrorRow_AVX2(const uint8_t* src, - uint8_t* dst, - int width) { - __asm { - mov eax, [esp + 4] // src - mov edx, [esp + 8] // dst - mov ecx, [esp + 12] // width - vmovdqu ymm5, ymmword ptr kARGBShuffleMirror_AVX2 - - convertloop: - vpermd ymm0, ymm5, [eax - 32 + ecx * 4] // permute dword order - vmovdqu [edx], ymm0 - lea edx, [edx + 32] - sub ecx, 8 - jg convertloop - vzeroupper - ret - } -} -#endif // HAS_ARGBMIRRORROW_AVX2 - -#ifdef HAS_SPLITUVROW_SSE2 -__declspec(naked) void SplitUVRow_SSE2(const uint8_t* src_uv, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - __asm { - push edi - mov eax, [esp + 4 + 4] // src_uv - mov edx, [esp + 4 + 8] // dst_u - mov edi, [esp + 4 + 12] // dst_v - mov ecx, [esp + 4 + 16] // width - pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff - psrlw xmm5, 8 - sub edi, edx - - convertloop: - movdqu xmm0, [eax] - movdqu xmm1, [eax + 16] - lea eax, [eax + 32] - movdqa xmm2, xmm0 - movdqa xmm3, xmm1 - pand xmm0, xmm5 // even bytes - pand xmm1, xmm5 - packuswb xmm0, xmm1 - psrlw xmm2, 8 // odd bytes - psrlw xmm3, 8 - packuswb xmm2, xmm3 - movdqu [edx], xmm0 - movdqu [edx + edi], xmm2 - lea edx, [edx + 16] - sub ecx, 16 - jg convertloop - - pop edi - ret - } -} - -#endif // HAS_SPLITUVROW_SSE2 - -#ifdef HAS_SPLITUVROW_AVX2 -__declspec(naked) void SplitUVRow_AVX2(const uint8_t* src_uv, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - __asm { - push edi - mov eax, [esp + 4 + 4] // src_uv - mov edx, [esp + 4 + 8] // dst_u - mov edi, [esp + 4 + 12] // dst_v - mov ecx, [esp + 4 + 16] // width - vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff - vpsrlw ymm5, ymm5, 8 - sub edi, edx - - convertloop: - vmovdqu ymm0, [eax] - vmovdqu ymm1, [eax + 32] - lea eax, [eax + 64] - vpsrlw ymm2, ymm0, 8 // odd bytes - vpsrlw ymm3, ymm1, 8 - vpand ymm0, ymm0, ymm5 // even bytes - vpand ymm1, ymm1, ymm5 - vpackuswb ymm0, ymm0, ymm1 - vpackuswb ymm2, ymm2, ymm3 - vpermq ymm0, ymm0, 0xd8 - vpermq ymm2, ymm2, 0xd8 - vmovdqu [edx], ymm0 - vmovdqu [edx + edi], ymm2 - lea edx, [edx + 32] - sub ecx, 32 - jg convertloop - - pop edi - vzeroupper - ret - } -} -#endif // HAS_SPLITUVROW_AVX2 - -#ifdef HAS_MERGEUVROW_SSE2 -__declspec(naked) void MergeUVRow_SSE2(const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* dst_uv, - int width) { - __asm { - push edi - mov eax, [esp + 4 + 4] // src_u - mov edx, [esp + 4 + 8] // src_v - mov edi, [esp + 4 + 12] // dst_uv - mov ecx, [esp + 4 + 16] // width - sub edx, eax - - convertloop: - movdqu xmm0, [eax] // read 16 U's - movdqu xmm1, [eax + edx] // and 16 V's - lea eax, [eax + 16] - movdqa xmm2, xmm0 - punpcklbw xmm0, xmm1 // first 8 UV pairs - punpckhbw xmm2, xmm1 // next 8 UV pairs - movdqu [edi], xmm0 - movdqu [edi + 16], xmm2 - lea edi, [edi + 32] - sub ecx, 16 - jg convertloop - - pop edi - ret - } -} -#endif // HAS_MERGEUVROW_SSE2 - -#ifdef HAS_MERGEUVROW_AVX2 -__declspec(naked) void MergeUVRow_AVX2(const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* dst_uv, - int width) { - __asm { - push edi - mov eax, [esp + 4 + 4] // src_u - mov edx, [esp + 4 + 8] // src_v - mov edi, [esp + 4 + 12] // dst_uv - mov ecx, [esp + 4 + 16] // width - sub edx, eax - - convertloop: - vmovdqu ymm0, [eax] // read 32 U's - vmovdqu ymm1, [eax + edx] // and 32 V's - lea eax, [eax + 32] - vpunpcklbw ymm2, ymm0, ymm1 // low 16 UV pairs. mutated qqword 0,2 - vpunpckhbw ymm0, ymm0, ymm1 // high 16 UV pairs. 
mutated qqword 1,3 - vextractf128 [edi], ymm2, 0 // bytes 0..15 - vextractf128 [edi + 16], ymm0, 0 // bytes 16..31 - vextractf128 [edi + 32], ymm2, 1 // bytes 32..47 - vextractf128 [edi + 48], ymm0, 1 // bytes 47..63 - lea edi, [edi + 64] - sub ecx, 32 - jg convertloop - - pop edi - vzeroupper - ret - } -} -#endif // HAS_MERGEUVROW_AVX2 - -#ifdef HAS_COPYROW_SSE2 -// CopyRow copys 'width' bytes using a 16 byte load/store, 32 bytes at time. -__declspec(naked) void CopyRow_SSE2(const uint8_t* src, - uint8_t* dst, - int width) { - __asm { - mov eax, [esp + 4] // src - mov edx, [esp + 8] // dst - mov ecx, [esp + 12] // width - test eax, 15 - jne convertloopu - test edx, 15 - jne convertloopu - - convertloopa: - movdqa xmm0, [eax] - movdqa xmm1, [eax + 16] - lea eax, [eax + 32] - movdqa [edx], xmm0 - movdqa [edx + 16], xmm1 - lea edx, [edx + 32] - sub ecx, 32 - jg convertloopa - ret - - convertloopu: - movdqu xmm0, [eax] - movdqu xmm1, [eax + 16] - lea eax, [eax + 32] - movdqu [edx], xmm0 - movdqu [edx + 16], xmm1 - lea edx, [edx + 32] - sub ecx, 32 - jg convertloopu - ret - } -} -#endif // HAS_COPYROW_SSE2 - -#ifdef HAS_COPYROW_AVX -// CopyRow copys 'width' bytes using a 32 byte load/store, 64 bytes at time. -__declspec(naked) void CopyRow_AVX(const uint8_t* src, - uint8_t* dst, - int width) { - __asm { - mov eax, [esp + 4] // src - mov edx, [esp + 8] // dst - mov ecx, [esp + 12] // width - - convertloop: - vmovdqu ymm0, [eax] - vmovdqu ymm1, [eax + 32] - lea eax, [eax + 64] - vmovdqu [edx], ymm0 - vmovdqu [edx + 32], ymm1 - lea edx, [edx + 64] - sub ecx, 64 - jg convertloop - - vzeroupper - ret - } -} -#endif // HAS_COPYROW_AVX - -// Multiple of 1. -__declspec(naked) void CopyRow_ERMS(const uint8_t* src, - uint8_t* dst, - int width) { - __asm { - mov eax, esi - mov edx, edi - mov esi, [esp + 4] // src - mov edi, [esp + 8] // dst - mov ecx, [esp + 12] // width - rep movsb - mov edi, edx - mov esi, eax - ret - } -} - -#ifdef HAS_ARGBCOPYALPHAROW_SSE2 -// width in pixels -__declspec(naked) void ARGBCopyAlphaRow_SSE2(const uint8_t* src, - uint8_t* dst, - int width) { - __asm { - mov eax, [esp + 4] // src - mov edx, [esp + 8] // dst - mov ecx, [esp + 12] // width - pcmpeqb xmm0, xmm0 // generate mask 0xff000000 - pslld xmm0, 24 - pcmpeqb xmm1, xmm1 // generate mask 0x00ffffff - psrld xmm1, 8 - - convertloop: - movdqu xmm2, [eax] - movdqu xmm3, [eax + 16] - lea eax, [eax + 32] - movdqu xmm4, [edx] - movdqu xmm5, [edx + 16] - pand xmm2, xmm0 - pand xmm3, xmm0 - pand xmm4, xmm1 - pand xmm5, xmm1 - por xmm2, xmm4 - por xmm3, xmm5 - movdqu [edx], xmm2 - movdqu [edx + 16], xmm3 - lea edx, [edx + 32] - sub ecx, 8 - jg convertloop - - ret - } -} -#endif // HAS_ARGBCOPYALPHAROW_SSE2 - -#ifdef HAS_ARGBCOPYALPHAROW_AVX2 -// width in pixels -__declspec(naked) void ARGBCopyAlphaRow_AVX2(const uint8_t* src, - uint8_t* dst, - int width) { - __asm { - mov eax, [esp + 4] // src - mov edx, [esp + 8] // dst - mov ecx, [esp + 12] // width - vpcmpeqb ymm0, ymm0, ymm0 - vpsrld ymm0, ymm0, 8 // generate mask 0x00ffffff - - convertloop: - vmovdqu ymm1, [eax] - vmovdqu ymm2, [eax + 32] - lea eax, [eax + 64] - vpblendvb ymm1, ymm1, [edx], ymm0 - vpblendvb ymm2, ymm2, [edx + 32], ymm0 - vmovdqu [edx], ymm1 - vmovdqu [edx + 32], ymm2 - lea edx, [edx + 64] - sub ecx, 16 - jg convertloop - - vzeroupper - ret - } -} -#endif // HAS_ARGBCOPYALPHAROW_AVX2 - -#ifdef HAS_ARGBEXTRACTALPHAROW_SSE2 -// width in pixels -__declspec(naked) void ARGBExtractAlphaRow_SSE2(const uint8_t* src_argb, - uint8_t* dst_a, - int width) { - __asm { - 
mov eax, [esp + 4] // src_argb - mov edx, [esp + 8] // dst_a - mov ecx, [esp + 12] // width - - extractloop: - movdqu xmm0, [eax] - movdqu xmm1, [eax + 16] - lea eax, [eax + 32] - psrld xmm0, 24 - psrld xmm1, 24 - packssdw xmm0, xmm1 - packuswb xmm0, xmm0 - movq qword ptr [edx], xmm0 - lea edx, [edx + 8] - sub ecx, 8 - jg extractloop - - ret - } -} -#endif // HAS_ARGBEXTRACTALPHAROW_SSE2 - -#ifdef HAS_ARGBEXTRACTALPHAROW_AVX2 -// width in pixels -__declspec(naked) void ARGBExtractAlphaRow_AVX2(const uint8_t* src_argb, - uint8_t* dst_a, - int width) { - __asm { - mov eax, [esp + 4] // src_argb - mov edx, [esp + 8] // dst_a - mov ecx, [esp + 12] // width - vmovdqa ymm4, ymmword ptr kPermdARGBToY_AVX - - extractloop: - vmovdqu ymm0, [eax] - vmovdqu ymm1, [eax + 32] - vpsrld ymm0, ymm0, 24 - vpsrld ymm1, ymm1, 24 - vmovdqu ymm2, [eax + 64] - vmovdqu ymm3, [eax + 96] - lea eax, [eax + 128] - vpackssdw ymm0, ymm0, ymm1 // mutates - vpsrld ymm2, ymm2, 24 - vpsrld ymm3, ymm3, 24 - vpackssdw ymm2, ymm2, ymm3 // mutates - vpackuswb ymm0, ymm0, ymm2 // mutates - vpermd ymm0, ymm4, ymm0 // unmutate - vmovdqu [edx], ymm0 - lea edx, [edx + 32] - sub ecx, 32 - jg extractloop - - vzeroupper - ret - } -} -#endif // HAS_ARGBEXTRACTALPHAROW_AVX2 - -#ifdef HAS_ARGBCOPYYTOALPHAROW_SSE2 -// width in pixels -__declspec(naked) void ARGBCopyYToAlphaRow_SSE2(const uint8_t* src, - uint8_t* dst, - int width) { - __asm { - mov eax, [esp + 4] // src - mov edx, [esp + 8] // dst - mov ecx, [esp + 12] // width - pcmpeqb xmm0, xmm0 // generate mask 0xff000000 - pslld xmm0, 24 - pcmpeqb xmm1, xmm1 // generate mask 0x00ffffff - psrld xmm1, 8 - - convertloop: - movq xmm2, qword ptr [eax] // 8 Y's - lea eax, [eax + 8] - punpcklbw xmm2, xmm2 - punpckhwd xmm3, xmm2 - punpcklwd xmm2, xmm2 - movdqu xmm4, [edx] - movdqu xmm5, [edx + 16] - pand xmm2, xmm0 - pand xmm3, xmm0 - pand xmm4, xmm1 - pand xmm5, xmm1 - por xmm2, xmm4 - por xmm3, xmm5 - movdqu [edx], xmm2 - movdqu [edx + 16], xmm3 - lea edx, [edx + 32] - sub ecx, 8 - jg convertloop - - ret - } -} -#endif // HAS_ARGBCOPYYTOALPHAROW_SSE2 - -#ifdef HAS_ARGBCOPYYTOALPHAROW_AVX2 -// width in pixels -__declspec(naked) void ARGBCopyYToAlphaRow_AVX2(const uint8_t* src, - uint8_t* dst, - int width) { - __asm { - mov eax, [esp + 4] // src - mov edx, [esp + 8] // dst - mov ecx, [esp + 12] // width - vpcmpeqb ymm0, ymm0, ymm0 - vpsrld ymm0, ymm0, 8 // generate mask 0x00ffffff - - convertloop: - vpmovzxbd ymm1, qword ptr [eax] - vpmovzxbd ymm2, qword ptr [eax + 8] - lea eax, [eax + 16] - vpslld ymm1, ymm1, 24 - vpslld ymm2, ymm2, 24 - vpblendvb ymm1, ymm1, [edx], ymm0 - vpblendvb ymm2, ymm2, [edx + 32], ymm0 - vmovdqu [edx], ymm1 - vmovdqu [edx + 32], ymm2 - lea edx, [edx + 64] - sub ecx, 16 - jg convertloop - - vzeroupper - ret - } -} -#endif // HAS_ARGBCOPYYTOALPHAROW_AVX2 - -#ifdef HAS_SETROW_X86 -// Write 'width' bytes using an 8 bit value repeated. -// width should be multiple of 4. -__declspec(naked) void SetRow_X86(uint8_t* dst, uint8_t v8, int width) { - __asm { - movzx eax, byte ptr [esp + 8] // v8 - mov edx, 0x01010101 // Duplicate byte to all bytes. - mul edx // overwrites edx with upper part of result. - mov edx, edi - mov edi, [esp + 4] // dst - mov ecx, [esp + 12] // width - shr ecx, 2 - rep stosd - mov edi, edx - ret - } -} - -// Write 'width' bytes using an 8 bit value repeated. 
-__declspec(naked) void SetRow_ERMS(uint8_t* dst, uint8_t v8, int width) { - __asm { - mov edx, edi - mov edi, [esp + 4] // dst - mov eax, [esp + 8] // v8 - mov ecx, [esp + 12] // width - rep stosb - mov edi, edx - ret - } -} - -// Write 'width' 32 bit values. -__declspec(naked) void ARGBSetRow_X86(uint8_t* dst_argb, - uint32_t v32, - int width) { - __asm { - mov edx, edi - mov edi, [esp + 4] // dst - mov eax, [esp + 8] // v32 - mov ecx, [esp + 12] // width - rep stosd - mov edi, edx - ret - } -} -#endif // HAS_SETROW_X86 - -#ifdef HAS_YUY2TOYROW_AVX2 -__declspec(naked) void YUY2ToYRow_AVX2(const uint8_t* src_yuy2, - uint8_t* dst_y, - int width) { - __asm { - mov eax, [esp + 4] // src_yuy2 - mov edx, [esp + 8] // dst_y - mov ecx, [esp + 12] // width - vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff - vpsrlw ymm5, ymm5, 8 - - convertloop: - vmovdqu ymm0, [eax] - vmovdqu ymm1, [eax + 32] - lea eax, [eax + 64] - vpand ymm0, ymm0, ymm5 // even bytes are Y - vpand ymm1, ymm1, ymm5 - vpackuswb ymm0, ymm0, ymm1 // mutates. - vpermq ymm0, ymm0, 0xd8 - vmovdqu [edx], ymm0 - lea edx, [edx + 32] - sub ecx, 32 - jg convertloop - vzeroupper - ret - } -} - -__declspec(naked) void YUY2ToUVRow_AVX2(const uint8_t* src_yuy2, - int stride_yuy2, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - __asm { - push esi - push edi - mov eax, [esp + 8 + 4] // src_yuy2 - mov esi, [esp + 8 + 8] // stride_yuy2 - mov edx, [esp + 8 + 12] // dst_u - mov edi, [esp + 8 + 16] // dst_v - mov ecx, [esp + 8 + 20] // width - vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff - vpsrlw ymm5, ymm5, 8 - sub edi, edx - - convertloop: - vmovdqu ymm0, [eax] - vmovdqu ymm1, [eax + 32] - vpavgb ymm0, ymm0, [eax + esi] - vpavgb ymm1, ymm1, [eax + esi + 32] - lea eax, [eax + 64] - vpsrlw ymm0, ymm0, 8 // YUYV -> UVUV - vpsrlw ymm1, ymm1, 8 - vpackuswb ymm0, ymm0, ymm1 // mutates. - vpermq ymm0, ymm0, 0xd8 - vpand ymm1, ymm0, ymm5 // U - vpsrlw ymm0, ymm0, 8 // V - vpackuswb ymm1, ymm1, ymm1 // mutates. - vpackuswb ymm0, ymm0, ymm0 // mutates. - vpermq ymm1, ymm1, 0xd8 - vpermq ymm0, ymm0, 0xd8 - vextractf128 [edx], ymm1, 0 // U - vextractf128 [edx + edi], ymm0, 0 // V - lea edx, [edx + 16] - sub ecx, 32 - jg convertloop - - pop edi - pop esi - vzeroupper - ret - } -} - -__declspec(naked) void YUY2ToUV422Row_AVX2(const uint8_t* src_yuy2, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - __asm { - push edi - mov eax, [esp + 4 + 4] // src_yuy2 - mov edx, [esp + 4 + 8] // dst_u - mov edi, [esp + 4 + 12] // dst_v - mov ecx, [esp + 4 + 16] // width - vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff - vpsrlw ymm5, ymm5, 8 - sub edi, edx - - convertloop: - vmovdqu ymm0, [eax] - vmovdqu ymm1, [eax + 32] - lea eax, [eax + 64] - vpsrlw ymm0, ymm0, 8 // YUYV -> UVUV - vpsrlw ymm1, ymm1, 8 - vpackuswb ymm0, ymm0, ymm1 // mutates. - vpermq ymm0, ymm0, 0xd8 - vpand ymm1, ymm0, ymm5 // U - vpsrlw ymm0, ymm0, 8 // V - vpackuswb ymm1, ymm1, ymm1 // mutates. - vpackuswb ymm0, ymm0, ymm0 // mutates. 
- vpermq ymm1, ymm1, 0xd8 - vpermq ymm0, ymm0, 0xd8 - vextractf128 [edx], ymm1, 0 // U - vextractf128 [edx + edi], ymm0, 0 // V - lea edx, [edx + 16] - sub ecx, 32 - jg convertloop - - pop edi - vzeroupper - ret - } -} - -__declspec(naked) void UYVYToYRow_AVX2(const uint8_t* src_uyvy, - uint8_t* dst_y, - int width) { - __asm { - mov eax, [esp + 4] // src_uyvy - mov edx, [esp + 8] // dst_y - mov ecx, [esp + 12] // width - - convertloop: - vmovdqu ymm0, [eax] - vmovdqu ymm1, [eax + 32] - lea eax, [eax + 64] - vpsrlw ymm0, ymm0, 8 // odd bytes are Y - vpsrlw ymm1, ymm1, 8 - vpackuswb ymm0, ymm0, ymm1 // mutates. - vpermq ymm0, ymm0, 0xd8 - vmovdqu [edx], ymm0 - lea edx, [edx + 32] - sub ecx, 32 - jg convertloop - vzeroupper - ret - } -} - -__declspec(naked) void UYVYToUVRow_AVX2(const uint8_t* src_uyvy, - int stride_uyvy, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - __asm { - push esi - push edi - mov eax, [esp + 8 + 4] // src_yuy2 - mov esi, [esp + 8 + 8] // stride_yuy2 - mov edx, [esp + 8 + 12] // dst_u - mov edi, [esp + 8 + 16] // dst_v - mov ecx, [esp + 8 + 20] // width - vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff - vpsrlw ymm5, ymm5, 8 - sub edi, edx - - convertloop: - vmovdqu ymm0, [eax] - vmovdqu ymm1, [eax + 32] - vpavgb ymm0, ymm0, [eax + esi] - vpavgb ymm1, ymm1, [eax + esi + 32] - lea eax, [eax + 64] - vpand ymm0, ymm0, ymm5 // UYVY -> UVUV - vpand ymm1, ymm1, ymm5 - vpackuswb ymm0, ymm0, ymm1 // mutates. - vpermq ymm0, ymm0, 0xd8 - vpand ymm1, ymm0, ymm5 // U - vpsrlw ymm0, ymm0, 8 // V - vpackuswb ymm1, ymm1, ymm1 // mutates. - vpackuswb ymm0, ymm0, ymm0 // mutates. - vpermq ymm1, ymm1, 0xd8 - vpermq ymm0, ymm0, 0xd8 - vextractf128 [edx], ymm1, 0 // U - vextractf128 [edx + edi], ymm0, 0 // V - lea edx, [edx + 16] - sub ecx, 32 - jg convertloop - - pop edi - pop esi - vzeroupper - ret - } -} - -__declspec(naked) void UYVYToUV422Row_AVX2(const uint8_t* src_uyvy, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - __asm { - push edi - mov eax, [esp + 4 + 4] // src_yuy2 - mov edx, [esp + 4 + 8] // dst_u - mov edi, [esp + 4 + 12] // dst_v - mov ecx, [esp + 4 + 16] // width - vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff - vpsrlw ymm5, ymm5, 8 - sub edi, edx - - convertloop: - vmovdqu ymm0, [eax] - vmovdqu ymm1, [eax + 32] - lea eax, [eax + 64] - vpand ymm0, ymm0, ymm5 // UYVY -> UVUV - vpand ymm1, ymm1, ymm5 - vpackuswb ymm0, ymm0, ymm1 // mutates. - vpermq ymm0, ymm0, 0xd8 - vpand ymm1, ymm0, ymm5 // U - vpsrlw ymm0, ymm0, 8 // V - vpackuswb ymm1, ymm1, ymm1 // mutates. - vpackuswb ymm0, ymm0, ymm0 // mutates. 
- vpermq ymm1, ymm1, 0xd8 - vpermq ymm0, ymm0, 0xd8 - vextractf128 [edx], ymm1, 0 // U - vextractf128 [edx + edi], ymm0, 0 // V - lea edx, [edx + 16] - sub ecx, 32 - jg convertloop - - pop edi - vzeroupper - ret - } -} -#endif // HAS_YUY2TOYROW_AVX2 - -#ifdef HAS_YUY2TOYROW_SSE2 -__declspec(naked) void YUY2ToYRow_SSE2(const uint8_t* src_yuy2, - uint8_t* dst_y, - int width) { - __asm { - mov eax, [esp + 4] // src_yuy2 - mov edx, [esp + 8] // dst_y - mov ecx, [esp + 12] // width - pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff - psrlw xmm5, 8 - - convertloop: - movdqu xmm0, [eax] - movdqu xmm1, [eax + 16] - lea eax, [eax + 32] - pand xmm0, xmm5 // even bytes are Y - pand xmm1, xmm5 - packuswb xmm0, xmm1 - movdqu [edx], xmm0 - lea edx, [edx + 16] - sub ecx, 16 - jg convertloop - ret - } -} - -__declspec(naked) void YUY2ToUVRow_SSE2(const uint8_t* src_yuy2, - int stride_yuy2, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - __asm { - push esi - push edi - mov eax, [esp + 8 + 4] // src_yuy2 - mov esi, [esp + 8 + 8] // stride_yuy2 - mov edx, [esp + 8 + 12] // dst_u - mov edi, [esp + 8 + 16] // dst_v - mov ecx, [esp + 8 + 20] // width - pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff - psrlw xmm5, 8 - sub edi, edx - - convertloop: - movdqu xmm0, [eax] - movdqu xmm1, [eax + 16] - movdqu xmm2, [eax + esi] - movdqu xmm3, [eax + esi + 16] - lea eax, [eax + 32] - pavgb xmm0, xmm2 - pavgb xmm1, xmm3 - psrlw xmm0, 8 // YUYV -> UVUV - psrlw xmm1, 8 - packuswb xmm0, xmm1 - movdqa xmm1, xmm0 - pand xmm0, xmm5 // U - packuswb xmm0, xmm0 - psrlw xmm1, 8 // V - packuswb xmm1, xmm1 - movq qword ptr [edx], xmm0 - movq qword ptr [edx + edi], xmm1 - lea edx, [edx + 8] - sub ecx, 16 - jg convertloop - - pop edi - pop esi - ret - } -} - -__declspec(naked) void YUY2ToUV422Row_SSE2(const uint8_t* src_yuy2, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - __asm { - push edi - mov eax, [esp + 4 + 4] // src_yuy2 - mov edx, [esp + 4 + 8] // dst_u - mov edi, [esp + 4 + 12] // dst_v - mov ecx, [esp + 4 + 16] // width - pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff - psrlw xmm5, 8 - sub edi, edx - - convertloop: - movdqu xmm0, [eax] - movdqu xmm1, [eax + 16] - lea eax, [eax + 32] - psrlw xmm0, 8 // YUYV -> UVUV - psrlw xmm1, 8 - packuswb xmm0, xmm1 - movdqa xmm1, xmm0 - pand xmm0, xmm5 // U - packuswb xmm0, xmm0 - psrlw xmm1, 8 // V - packuswb xmm1, xmm1 - movq qword ptr [edx], xmm0 - movq qword ptr [edx + edi], xmm1 - lea edx, [edx + 8] - sub ecx, 16 - jg convertloop - - pop edi - ret - } -} - -__declspec(naked) void UYVYToYRow_SSE2(const uint8_t* src_uyvy, - uint8_t* dst_y, - int width) { - __asm { - mov eax, [esp + 4] // src_uyvy - mov edx, [esp + 8] // dst_y - mov ecx, [esp + 12] // width - - convertloop: - movdqu xmm0, [eax] - movdqu xmm1, [eax + 16] - lea eax, [eax + 32] - psrlw xmm0, 8 // odd bytes are Y - psrlw xmm1, 8 - packuswb xmm0, xmm1 - movdqu [edx], xmm0 - lea edx, [edx + 16] - sub ecx, 16 - jg convertloop - ret - } -} - -__declspec(naked) void UYVYToUVRow_SSE2(const uint8_t* src_uyvy, - int stride_uyvy, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - __asm { - push esi - push edi - mov eax, [esp + 8 + 4] // src_yuy2 - mov esi, [esp + 8 + 8] // stride_yuy2 - mov edx, [esp + 8 + 12] // dst_u - mov edi, [esp + 8 + 16] // dst_v - mov ecx, [esp + 8 + 20] // width - pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff - psrlw xmm5, 8 - sub edi, edx - - convertloop: - movdqu xmm0, [eax] - movdqu xmm1, [eax + 16] - movdqu xmm2, [eax + esi] - movdqu xmm3, [eax + esi + 16] - lea eax, [eax + 32] - pavgb xmm0, 
xmm2 - pavgb xmm1, xmm3 - pand xmm0, xmm5 // UYVY -> UVUV - pand xmm1, xmm5 - packuswb xmm0, xmm1 - movdqa xmm1, xmm0 - pand xmm0, xmm5 // U - packuswb xmm0, xmm0 - psrlw xmm1, 8 // V - packuswb xmm1, xmm1 - movq qword ptr [edx], xmm0 - movq qword ptr [edx + edi], xmm1 - lea edx, [edx + 8] - sub ecx, 16 - jg convertloop - - pop edi - pop esi - ret - } -} - -__declspec(naked) void UYVYToUV422Row_SSE2(const uint8_t* src_uyvy, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - __asm { - push edi - mov eax, [esp + 4 + 4] // src_yuy2 - mov edx, [esp + 4 + 8] // dst_u - mov edi, [esp + 4 + 12] // dst_v - mov ecx, [esp + 4 + 16] // width - pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff - psrlw xmm5, 8 - sub edi, edx - - convertloop: - movdqu xmm0, [eax] - movdqu xmm1, [eax + 16] - lea eax, [eax + 32] - pand xmm0, xmm5 // UYVY -> UVUV - pand xmm1, xmm5 - packuswb xmm0, xmm1 - movdqa xmm1, xmm0 - pand xmm0, xmm5 // U - packuswb xmm0, xmm0 - psrlw xmm1, 8 // V - packuswb xmm1, xmm1 - movq qword ptr [edx], xmm0 - movq qword ptr [edx + edi], xmm1 - lea edx, [edx + 8] - sub ecx, 16 - jg convertloop - - pop edi - ret - } -} -#endif // HAS_YUY2TOYROW_SSE2 - -#ifdef HAS_BLENDPLANEROW_SSSE3 -// Blend 8 pixels at a time. -// unsigned version of math -// =((A2*C2)+(B2*(255-C2))+255)/256 -// signed version of math -// =(((A2-128)*C2)+((B2-128)*(255-C2))+32768+127)/256 -__declspec(naked) void BlendPlaneRow_SSSE3(const uint8_t* src0, - const uint8_t* src1, - const uint8_t* alpha, - uint8_t* dst, - int width) { - __asm { - push esi - push edi - pcmpeqb xmm5, xmm5 // generate mask 0xff00ff00 - psllw xmm5, 8 - mov eax, 0x80808080 // 128 for biasing image to signed. - movd xmm6, eax - pshufd xmm6, xmm6, 0x00 - - mov eax, 0x807f807f // 32768 + 127 for unbias and round. - movd xmm7, eax - pshufd xmm7, xmm7, 0x00 - mov eax, [esp + 8 + 4] // src0 - mov edx, [esp + 8 + 8] // src1 - mov esi, [esp + 8 + 12] // alpha - mov edi, [esp + 8 + 16] // dst - mov ecx, [esp + 8 + 20] // width - sub eax, esi - sub edx, esi - sub edi, esi - - // 8 pixel loop. - convertloop8: - movq xmm0, qword ptr [esi] // alpha - punpcklbw xmm0, xmm0 - pxor xmm0, xmm5 // a, 255-a - movq xmm1, qword ptr [eax + esi] // src0 - movq xmm2, qword ptr [edx + esi] // src1 - punpcklbw xmm1, xmm2 - psubb xmm1, xmm6 // bias src0/1 - 128 - pmaddubsw xmm0, xmm1 - paddw xmm0, xmm7 // unbias result - 32768 and round. - psrlw xmm0, 8 - packuswb xmm0, xmm0 - movq qword ptr [edi + esi], xmm0 - lea esi, [esi + 8] - sub ecx, 8 - jg convertloop8 - - pop edi - pop esi - ret - } -} -#endif // HAS_BLENDPLANEROW_SSSE3 - -#ifdef HAS_BLENDPLANEROW_AVX2 -// Blend 32 pixels at a time. -// unsigned version of math -// =((A2*C2)+(B2*(255-C2))+255)/256 -// signed version of math -// =(((A2-128)*C2)+((B2-128)*(255-C2))+32768+127)/256 -__declspec(naked) void BlendPlaneRow_AVX2(const uint8_t* src0, - const uint8_t* src1, - const uint8_t* alpha, - uint8_t* dst, - int width) { - __asm { - push esi - push edi - vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0xff00ff00 - vpsllw ymm5, ymm5, 8 - mov eax, 0x80808080 // 128 for biasing image to signed. - vmovd xmm6, eax - vbroadcastss ymm6, xmm6 - mov eax, 0x807f807f // 32768 + 127 for unbias and round. - vmovd xmm7, eax - vbroadcastss ymm7, xmm7 - mov eax, [esp + 8 + 4] // src0 - mov edx, [esp + 8 + 8] // src1 - mov esi, [esp + 8 + 12] // alpha - mov edi, [esp + 8 + 16] // dst - mov ecx, [esp + 8 + 20] // width - sub eax, esi - sub edx, esi - sub edi, esi - - // 32 pixel loop. 
- convertloop32: - vmovdqu ymm0, [esi] // alpha - vpunpckhbw ymm3, ymm0, ymm0 // 8..15, 24..31 - vpunpcklbw ymm0, ymm0, ymm0 // 0..7, 16..23 - vpxor ymm3, ymm3, ymm5 // a, 255-a - vpxor ymm0, ymm0, ymm5 // a, 255-a - vmovdqu ymm1, [eax + esi] // src0 - vmovdqu ymm2, [edx + esi] // src1 - vpunpckhbw ymm4, ymm1, ymm2 - vpunpcklbw ymm1, ymm1, ymm2 - vpsubb ymm4, ymm4, ymm6 // bias src0/1 - 128 - vpsubb ymm1, ymm1, ymm6 // bias src0/1 - 128 - vpmaddubsw ymm3, ymm3, ymm4 - vpmaddubsw ymm0, ymm0, ymm1 - vpaddw ymm3, ymm3, ymm7 // unbias result - 32768 and round. - vpaddw ymm0, ymm0, ymm7 // unbias result - 32768 and round. - vpsrlw ymm3, ymm3, 8 - vpsrlw ymm0, ymm0, 8 - vpackuswb ymm0, ymm0, ymm3 - vmovdqu [edi + esi], ymm0 - lea esi, [esi + 32] - sub ecx, 32 - jg convertloop32 - - pop edi - pop esi - vzeroupper - ret - } -} -#endif // HAS_BLENDPLANEROW_AVX2 - -#ifdef HAS_ARGBBLENDROW_SSSE3 -// Shuffle table for isolating alpha. -static const uvec8 kShuffleAlpha = {3u, 0x80, 3u, 0x80, 7u, 0x80, 7u, 0x80, - 11u, 0x80, 11u, 0x80, 15u, 0x80, 15u, 0x80}; - -// Blend 8 pixels at a time. -__declspec(naked) void ARGBBlendRow_SSSE3(const uint8_t* src_argb, - const uint8_t* src_argb1, - uint8_t* dst_argb, - int width) { - __asm { - push esi - mov eax, [esp + 4 + 4] // src_argb - mov esi, [esp + 4 + 8] // src_argb1 - mov edx, [esp + 4 + 12] // dst_argb - mov ecx, [esp + 4 + 16] // width - pcmpeqb xmm7, xmm7 // generate constant 0x0001 - psrlw xmm7, 15 - pcmpeqb xmm6, xmm6 // generate mask 0x00ff00ff - psrlw xmm6, 8 - pcmpeqb xmm5, xmm5 // generate mask 0xff00ff00 - psllw xmm5, 8 - pcmpeqb xmm4, xmm4 // generate mask 0xff000000 - pslld xmm4, 24 - sub ecx, 4 - jl convertloop4b // less than 4 pixels? - - // 4 pixel loop. - convertloop4: - movdqu xmm3, [eax] // src argb - lea eax, [eax + 16] - movdqa xmm0, xmm3 // src argb - pxor xmm3, xmm4 // ~alpha - movdqu xmm2, [esi] // _r_b - pshufb xmm3, xmmword ptr kShuffleAlpha // alpha - pand xmm2, xmm6 // _r_b - paddw xmm3, xmm7 // 256 - alpha - pmullw xmm2, xmm3 // _r_b * alpha - movdqu xmm1, [esi] // _a_g - lea esi, [esi + 16] - psrlw xmm1, 8 // _a_g - por xmm0, xmm4 // set alpha to 255 - pmullw xmm1, xmm3 // _a_g * alpha - psrlw xmm2, 8 // _r_b convert to 8 bits again - paddusb xmm0, xmm2 // + src argb - pand xmm1, xmm5 // a_g_ convert to 8 bits again - paddusb xmm0, xmm1 // + src argb - movdqu [edx], xmm0 - lea edx, [edx + 16] - sub ecx, 4 - jge convertloop4 - - convertloop4b: - add ecx, 4 - 1 - jl convertloop1b - - // 1 pixel loop. - convertloop1: - movd xmm3, [eax] // src argb - lea eax, [eax + 4] - movdqa xmm0, xmm3 // src argb - pxor xmm3, xmm4 // ~alpha - movd xmm2, [esi] // _r_b - pshufb xmm3, xmmword ptr kShuffleAlpha // alpha - pand xmm2, xmm6 // _r_b - paddw xmm3, xmm7 // 256 - alpha - pmullw xmm2, xmm3 // _r_b * alpha - movd xmm1, [esi] // _a_g - lea esi, [esi + 4] - psrlw xmm1, 8 // _a_g - por xmm0, xmm4 // set alpha to 255 - pmullw xmm1, xmm3 // _a_g * alpha - psrlw xmm2, 8 // _r_b convert to 8 bits again - paddusb xmm0, xmm2 // + src argb - pand xmm1, xmm5 // a_g_ convert to 8 bits again - paddusb xmm0, xmm1 // + src argb - movd [edx], xmm0 - lea edx, [edx + 4] - sub ecx, 1 - jge convertloop1 - - convertloop1b: - pop esi - ret - } -} -#endif // HAS_ARGBBLENDROW_SSSE3 - -#ifdef HAS_ARGBATTENUATEROW_SSSE3 -// Shuffle table duplicating alpha. 
-static const uvec8 kShuffleAlpha0 = { - 3u, 3u, 3u, 3u, 3u, 3u, 128u, 128u, 7u, 7u, 7u, 7u, 7u, 7u, 128u, 128u, -}; -static const uvec8 kShuffleAlpha1 = { - 11u, 11u, 11u, 11u, 11u, 11u, 128u, 128u, - 15u, 15u, 15u, 15u, 15u, 15u, 128u, 128u, -}; -__declspec(naked) void ARGBAttenuateRow_SSSE3(const uint8_t* src_argb, - uint8_t* dst_argb, - int width) { - __asm { - mov eax, [esp + 4] // src_argb - mov edx, [esp + 8] // dst_argb - mov ecx, [esp + 12] // width - pcmpeqb xmm3, xmm3 // generate mask 0xff000000 - pslld xmm3, 24 - movdqa xmm4, xmmword ptr kShuffleAlpha0 - movdqa xmm5, xmmword ptr kShuffleAlpha1 - - convertloop: - movdqu xmm0, [eax] // read 4 pixels - pshufb xmm0, xmm4 // isolate first 2 alphas - movdqu xmm1, [eax] // read 4 pixels - punpcklbw xmm1, xmm1 // first 2 pixel rgbs - pmulhuw xmm0, xmm1 // rgb * a - movdqu xmm1, [eax] // read 4 pixels - pshufb xmm1, xmm5 // isolate next 2 alphas - movdqu xmm2, [eax] // read 4 pixels - punpckhbw xmm2, xmm2 // next 2 pixel rgbs - pmulhuw xmm1, xmm2 // rgb * a - movdqu xmm2, [eax] // mask original alpha - lea eax, [eax + 16] - pand xmm2, xmm3 - psrlw xmm0, 8 - psrlw xmm1, 8 - packuswb xmm0, xmm1 - por xmm0, xmm2 // copy original alpha - movdqu [edx], xmm0 - lea edx, [edx + 16] - sub ecx, 4 - jg convertloop - - ret - } -} -#endif // HAS_ARGBATTENUATEROW_SSSE3 - -#ifdef HAS_ARGBATTENUATEROW_AVX2 -// Shuffle table duplicating alpha. -static const uvec8 kShuffleAlpha_AVX2 = {6u, 7u, 6u, 7u, 6u, 7u, - 128u, 128u, 14u, 15u, 14u, 15u, - 14u, 15u, 128u, 128u}; -__declspec(naked) void ARGBAttenuateRow_AVX2(const uint8_t* src_argb, - uint8_t* dst_argb, - int width) { - __asm { - mov eax, [esp + 4] // src_argb - mov edx, [esp + 8] // dst_argb - mov ecx, [esp + 12] // width - sub edx, eax - vbroadcastf128 ymm4, xmmword ptr kShuffleAlpha_AVX2 - vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0xff000000 - vpslld ymm5, ymm5, 24 - - convertloop: - vmovdqu ymm6, [eax] // read 8 pixels. - vpunpcklbw ymm0, ymm6, ymm6 // low 4 pixels. mutated. - vpunpckhbw ymm1, ymm6, ymm6 // high 4 pixels. mutated. - vpshufb ymm2, ymm0, ymm4 // low 4 alphas - vpshufb ymm3, ymm1, ymm4 // high 4 alphas - vpmulhuw ymm0, ymm0, ymm2 // rgb * a - vpmulhuw ymm1, ymm1, ymm3 // rgb * a - vpand ymm6, ymm6, ymm5 // isolate alpha - vpsrlw ymm0, ymm0, 8 - vpsrlw ymm1, ymm1, 8 - vpackuswb ymm0, ymm0, ymm1 // unmutated. - vpor ymm0, ymm0, ymm6 // copy original alpha - vmovdqu [eax + edx], ymm0 - lea eax, [eax + 32] - sub ecx, 8 - jg convertloop - - vzeroupper - ret - } -} -#endif // HAS_ARGBATTENUATEROW_AVX2 - -#ifdef HAS_ARGBUNATTENUATEROW_SSE2 -// Unattenuate 4 pixels at a time. -__declspec(naked) void ARGBUnattenuateRow_SSE2(const uint8_t* src_argb, - uint8_t* dst_argb, - int width) { - __asm { - push ebx - push esi - push edi - mov eax, [esp + 12 + 4] // src_argb - mov edx, [esp + 12 + 8] // dst_argb - mov ecx, [esp + 12 + 12] // width - lea ebx, fixed_invtbl8 - - convertloop: - movdqu xmm0, [eax] // read 4 pixels - movzx esi, byte ptr [eax + 3] // first alpha - movzx edi, byte ptr [eax + 7] // second alpha - punpcklbw xmm0, xmm0 // first 2 - movd xmm2, dword ptr [ebx + esi * 4] - movd xmm3, dword ptr [ebx + edi * 4] - pshuflw xmm2, xmm2, 040h // first 4 inv_alpha words. 
1, a, a, a - pshuflw xmm3, xmm3, 040h // next 4 inv_alpha words - movlhps xmm2, xmm3 - pmulhuw xmm0, xmm2 // rgb * a - - movdqu xmm1, [eax] // read 4 pixels - movzx esi, byte ptr [eax + 11] // third alpha - movzx edi, byte ptr [eax + 15] // forth alpha - punpckhbw xmm1, xmm1 // next 2 - movd xmm2, dword ptr [ebx + esi * 4] - movd xmm3, dword ptr [ebx + edi * 4] - pshuflw xmm2, xmm2, 040h // first 4 inv_alpha words - pshuflw xmm3, xmm3, 040h // next 4 inv_alpha words - movlhps xmm2, xmm3 - pmulhuw xmm1, xmm2 // rgb * a - lea eax, [eax + 16] - packuswb xmm0, xmm1 - movdqu [edx], xmm0 - lea edx, [edx + 16] - sub ecx, 4 - jg convertloop - - pop edi - pop esi - pop ebx - ret - } -} -#endif // HAS_ARGBUNATTENUATEROW_SSE2 - -#ifdef HAS_ARGBUNATTENUATEROW_AVX2 -// Shuffle table duplicating alpha. -static const uvec8 kUnattenShuffleAlpha_AVX2 = { - 0u, 1u, 0u, 1u, 0u, 1u, 6u, 7u, 8u, 9u, 8u, 9u, 8u, 9u, 14u, 15u}; -// TODO(fbarchard): Enable USE_GATHER for future hardware if faster. -// USE_GATHER is not on by default, due to being a slow instruction. -#ifdef USE_GATHER -__declspec(naked) void ARGBUnattenuateRow_AVX2(const uint8_t* src_argb, - uint8_t* dst_argb, - int width) { - __asm { - mov eax, [esp + 4] // src_argb - mov edx, [esp + 8] // dst_argb - mov ecx, [esp + 12] // width - sub edx, eax - vbroadcastf128 ymm4, xmmword ptr kUnattenShuffleAlpha_AVX2 - - convertloop: - vmovdqu ymm6, [eax] // read 8 pixels. - vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0xffffffff for gather. - vpsrld ymm2, ymm6, 24 // alpha in low 8 bits. - vpunpcklbw ymm0, ymm6, ymm6 // low 4 pixels. mutated. - vpunpckhbw ymm1, ymm6, ymm6 // high 4 pixels. mutated. - vpgatherdd ymm3, [ymm2 * 4 + fixed_invtbl8], ymm5 // ymm5 cleared. 1, a - vpunpcklwd ymm2, ymm3, ymm3 // low 4 inverted alphas. mutated. 1, 1, a, a - vpunpckhwd ymm3, ymm3, ymm3 // high 4 inverted alphas. mutated. - vpshufb ymm2, ymm2, ymm4 // replicate low 4 alphas. 1, a, a, a - vpshufb ymm3, ymm3, ymm4 // replicate high 4 alphas - vpmulhuw ymm0, ymm0, ymm2 // rgb * ia - vpmulhuw ymm1, ymm1, ymm3 // rgb * ia - vpackuswb ymm0, ymm0, ymm1 // unmutated. 
- vmovdqu [eax + edx], ymm0 - lea eax, [eax + 32] - sub ecx, 8 - jg convertloop - - vzeroupper - ret - } -} -#else // USE_GATHER -__declspec(naked) void ARGBUnattenuateRow_AVX2(const uint8_t* src_argb, - uint8_t* dst_argb, - int width) { - __asm { - - push ebx - push esi - push edi - mov eax, [esp + 12 + 4] // src_argb - mov edx, [esp + 12 + 8] // dst_argb - mov ecx, [esp + 12 + 12] // width - sub edx, eax - lea ebx, fixed_invtbl8 - vbroadcastf128 ymm5, xmmword ptr kUnattenShuffleAlpha_AVX2 - - convertloop: - // replace VPGATHER - movzx esi, byte ptr [eax + 3] // alpha0 - movzx edi, byte ptr [eax + 7] // alpha1 - vmovd xmm0, dword ptr [ebx + esi * 4] // [1,a0] - vmovd xmm1, dword ptr [ebx + edi * 4] // [1,a1] - movzx esi, byte ptr [eax + 11] // alpha2 - movzx edi, byte ptr [eax + 15] // alpha3 - vpunpckldq xmm6, xmm0, xmm1 // [1,a1,1,a0] - vmovd xmm2, dword ptr [ebx + esi * 4] // [1,a2] - vmovd xmm3, dword ptr [ebx + edi * 4] // [1,a3] - movzx esi, byte ptr [eax + 19] // alpha4 - movzx edi, byte ptr [eax + 23] // alpha5 - vpunpckldq xmm7, xmm2, xmm3 // [1,a3,1,a2] - vmovd xmm0, dword ptr [ebx + esi * 4] // [1,a4] - vmovd xmm1, dword ptr [ebx + edi * 4] // [1,a5] - movzx esi, byte ptr [eax + 27] // alpha6 - movzx edi, byte ptr [eax + 31] // alpha7 - vpunpckldq xmm0, xmm0, xmm1 // [1,a5,1,a4] - vmovd xmm2, dword ptr [ebx + esi * 4] // [1,a6] - vmovd xmm3, dword ptr [ebx + edi * 4] // [1,a7] - vpunpckldq xmm2, xmm2, xmm3 // [1,a7,1,a6] - vpunpcklqdq xmm3, xmm6, xmm7 // [1,a3,1,a2,1,a1,1,a0] - vpunpcklqdq xmm0, xmm0, xmm2 // [1,a7,1,a6,1,a5,1,a4] - vinserti128 ymm3, ymm3, xmm0, 1 // [1,a7,1,a6,1,a5,1,a4,1,a3,1,a2,1,a1,1,a0] - // end of VPGATHER - - vmovdqu ymm6, [eax] // read 8 pixels. - vpunpcklbw ymm0, ymm6, ymm6 // low 4 pixels. mutated. - vpunpckhbw ymm1, ymm6, ymm6 // high 4 pixels. mutated. - vpunpcklwd ymm2, ymm3, ymm3 // low 4 inverted alphas. mutated. 1, 1, a, a - vpunpckhwd ymm3, ymm3, ymm3 // high 4 inverted alphas. mutated. - vpshufb ymm2, ymm2, ymm5 // replicate low 4 alphas. 1, a, a, a - vpshufb ymm3, ymm3, ymm5 // replicate high 4 alphas - vpmulhuw ymm0, ymm0, ymm2 // rgb * ia - vpmulhuw ymm1, ymm1, ymm3 // rgb * ia - vpackuswb ymm0, ymm0, ymm1 // unmutated. - vmovdqu [eax + edx], ymm0 - lea eax, [eax + 32] - sub ecx, 8 - jg convertloop - - pop edi - pop esi - pop ebx - vzeroupper - ret - } -} -#endif // USE_GATHER -#endif // HAS_ARGBATTENUATEROW_AVX2 - -#ifdef HAS_ARGBGRAYROW_SSSE3 -// Convert 8 ARGB pixels (64 bytes) to 8 Gray ARGB pixels. -__declspec(naked) void ARGBGrayRow_SSSE3(const uint8_t* src_argb, - uint8_t* dst_argb, - int width) { - __asm { - mov eax, [esp + 4] /* src_argb */ - mov edx, [esp + 8] /* dst_argb */ - mov ecx, [esp + 12] /* width */ - movdqa xmm4, xmmword ptr kARGBToYJ - movdqa xmm5, xmmword ptr kAddYJ64 - - convertloop: - movdqu xmm0, [eax] // G - movdqu xmm1, [eax + 16] - pmaddubsw xmm0, xmm4 - pmaddubsw xmm1, xmm4 - phaddw xmm0, xmm1 - paddw xmm0, xmm5 // Add .5 for rounding. 
- psrlw xmm0, 7 - packuswb xmm0, xmm0 // 8 G bytes - movdqu xmm2, [eax] // A - movdqu xmm3, [eax + 16] - lea eax, [eax + 32] - psrld xmm2, 24 - psrld xmm3, 24 - packuswb xmm2, xmm3 - packuswb xmm2, xmm2 // 8 A bytes - movdqa xmm3, xmm0 // Weave into GG, GA, then GGGA - punpcklbw xmm0, xmm0 // 8 GG words - punpcklbw xmm3, xmm2 // 8 GA words - movdqa xmm1, xmm0 - punpcklwd xmm0, xmm3 // GGGA first 4 - punpckhwd xmm1, xmm3 // GGGA next 4 - movdqu [edx], xmm0 - movdqu [edx + 16], xmm1 - lea edx, [edx + 32] - sub ecx, 8 - jg convertloop - ret - } -} -#endif // HAS_ARGBGRAYROW_SSSE3 - -#ifdef HAS_ARGBSEPIAROW_SSSE3 -// b = (r * 35 + g * 68 + b * 17) >> 7 -// g = (r * 45 + g * 88 + b * 22) >> 7 -// r = (r * 50 + g * 98 + b * 24) >> 7 -// Constant for ARGB color to sepia tone. -static const vec8 kARGBToSepiaB = {17, 68, 35, 0, 17, 68, 35, 0, - 17, 68, 35, 0, 17, 68, 35, 0}; - -static const vec8 kARGBToSepiaG = {22, 88, 45, 0, 22, 88, 45, 0, - 22, 88, 45, 0, 22, 88, 45, 0}; - -static const vec8 kARGBToSepiaR = {24, 98, 50, 0, 24, 98, 50, 0, - 24, 98, 50, 0, 24, 98, 50, 0}; - -// Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels. -__declspec(naked) void ARGBSepiaRow_SSSE3(uint8_t* dst_argb, int width) { - __asm { - mov eax, [esp + 4] /* dst_argb */ - mov ecx, [esp + 8] /* width */ - movdqa xmm2, xmmword ptr kARGBToSepiaB - movdqa xmm3, xmmword ptr kARGBToSepiaG - movdqa xmm4, xmmword ptr kARGBToSepiaR - - convertloop: - movdqu xmm0, [eax] // B - movdqu xmm6, [eax + 16] - pmaddubsw xmm0, xmm2 - pmaddubsw xmm6, xmm2 - phaddw xmm0, xmm6 - psrlw xmm0, 7 - packuswb xmm0, xmm0 // 8 B values - movdqu xmm5, [eax] // G - movdqu xmm1, [eax + 16] - pmaddubsw xmm5, xmm3 - pmaddubsw xmm1, xmm3 - phaddw xmm5, xmm1 - psrlw xmm5, 7 - packuswb xmm5, xmm5 // 8 G values - punpcklbw xmm0, xmm5 // 8 BG values - movdqu xmm5, [eax] // R - movdqu xmm1, [eax + 16] - pmaddubsw xmm5, xmm4 - pmaddubsw xmm1, xmm4 - phaddw xmm5, xmm1 - psrlw xmm5, 7 - packuswb xmm5, xmm5 // 8 R values - movdqu xmm6, [eax] // A - movdqu xmm1, [eax + 16] - psrld xmm6, 24 - psrld xmm1, 24 - packuswb xmm6, xmm1 - packuswb xmm6, xmm6 // 8 A values - punpcklbw xmm5, xmm6 // 8 RA values - movdqa xmm1, xmm0 // Weave BG, RA together - punpcklwd xmm0, xmm5 // BGRA first 4 - punpckhwd xmm1, xmm5 // BGRA next 4 - movdqu [eax], xmm0 - movdqu [eax + 16], xmm1 - lea eax, [eax + 32] - sub ecx, 8 - jg convertloop - ret - } -} -#endif // HAS_ARGBSEPIAROW_SSSE3 - -#ifdef HAS_ARGBCOLORMATRIXROW_SSSE3 -// Tranform 8 ARGB pixels (32 bytes) with color matrix. -// Same as Sepia except matrix is provided. -// TODO(fbarchard): packuswbs only use half of the reg. To make RGBA, combine R -// and B into a high and low, then G/A, unpackl/hbw and then unpckl/hwd. 
-__declspec(naked) void ARGBColorMatrixRow_SSSE3(const uint8_t* src_argb, - uint8_t* dst_argb, - const int8_t* matrix_argb, - int width) { - __asm { - mov eax, [esp + 4] /* src_argb */ - mov edx, [esp + 8] /* dst_argb */ - mov ecx, [esp + 12] /* matrix_argb */ - movdqu xmm5, [ecx] - pshufd xmm2, xmm5, 0x00 - pshufd xmm3, xmm5, 0x55 - pshufd xmm4, xmm5, 0xaa - pshufd xmm5, xmm5, 0xff - mov ecx, [esp + 16] /* width */ - - convertloop: - movdqu xmm0, [eax] // B - movdqu xmm7, [eax + 16] - pmaddubsw xmm0, xmm2 - pmaddubsw xmm7, xmm2 - movdqu xmm6, [eax] // G - movdqu xmm1, [eax + 16] - pmaddubsw xmm6, xmm3 - pmaddubsw xmm1, xmm3 - phaddsw xmm0, xmm7 // B - phaddsw xmm6, xmm1 // G - psraw xmm0, 6 // B - psraw xmm6, 6 // G - packuswb xmm0, xmm0 // 8 B values - packuswb xmm6, xmm6 // 8 G values - punpcklbw xmm0, xmm6 // 8 BG values - movdqu xmm1, [eax] // R - movdqu xmm7, [eax + 16] - pmaddubsw xmm1, xmm4 - pmaddubsw xmm7, xmm4 - phaddsw xmm1, xmm7 // R - movdqu xmm6, [eax] // A - movdqu xmm7, [eax + 16] - pmaddubsw xmm6, xmm5 - pmaddubsw xmm7, xmm5 - phaddsw xmm6, xmm7 // A - psraw xmm1, 6 // R - psraw xmm6, 6 // A - packuswb xmm1, xmm1 // 8 R values - packuswb xmm6, xmm6 // 8 A values - punpcklbw xmm1, xmm6 // 8 RA values - movdqa xmm6, xmm0 // Weave BG, RA together - punpcklwd xmm0, xmm1 // BGRA first 4 - punpckhwd xmm6, xmm1 // BGRA next 4 - movdqu [edx], xmm0 - movdqu [edx + 16], xmm6 - lea eax, [eax + 32] - lea edx, [edx + 32] - sub ecx, 8 - jg convertloop - ret - } -} -#endif // HAS_ARGBCOLORMATRIXROW_SSSE3 - -#ifdef HAS_ARGBQUANTIZEROW_SSE2 -// Quantize 4 ARGB pixels (16 bytes). -__declspec(naked) void ARGBQuantizeRow_SSE2(uint8_t* dst_argb, - int scale, - int interval_size, - int interval_offset, - int width) { - __asm { - mov eax, [esp + 4] /* dst_argb */ - movd xmm2, [esp + 8] /* scale */ - movd xmm3, [esp + 12] /* interval_size */ - movd xmm4, [esp + 16] /* interval_offset */ - mov ecx, [esp + 20] /* width */ - pshuflw xmm2, xmm2, 040h - pshufd xmm2, xmm2, 044h - pshuflw xmm3, xmm3, 040h - pshufd xmm3, xmm3, 044h - pshuflw xmm4, xmm4, 040h - pshufd xmm4, xmm4, 044h - pxor xmm5, xmm5 // constant 0 - pcmpeqb xmm6, xmm6 // generate mask 0xff000000 - pslld xmm6, 24 - - convertloop: - movdqu xmm0, [eax] // read 4 pixels - punpcklbw xmm0, xmm5 // first 2 pixels - pmulhuw xmm0, xmm2 // pixel * scale >> 16 - movdqu xmm1, [eax] // read 4 pixels - punpckhbw xmm1, xmm5 // next 2 pixels - pmulhuw xmm1, xmm2 - pmullw xmm0, xmm3 // * interval_size - movdqu xmm7, [eax] // read 4 pixels - pmullw xmm1, xmm3 - pand xmm7, xmm6 // mask alpha - paddw xmm0, xmm4 // + interval_size / 2 - paddw xmm1, xmm4 - packuswb xmm0, xmm1 - por xmm0, xmm7 - movdqu [eax], xmm0 - lea eax, [eax + 16] - sub ecx, 4 - jg convertloop - ret - } -} -#endif // HAS_ARGBQUANTIZEROW_SSE2 - -#ifdef HAS_ARGBSHADEROW_SSE2 -// Shade 4 pixels at a time by specified value. 
-__declspec(naked) void ARGBShadeRow_SSE2(const uint8_t* src_argb, - uint8_t* dst_argb, - int width, - uint32_t value) { - __asm { - mov eax, [esp + 4] // src_argb - mov edx, [esp + 8] // dst_argb - mov ecx, [esp + 12] // width - movd xmm2, [esp + 16] // value - punpcklbw xmm2, xmm2 - punpcklqdq xmm2, xmm2 - - convertloop: - movdqu xmm0, [eax] // read 4 pixels - lea eax, [eax + 16] - movdqa xmm1, xmm0 - punpcklbw xmm0, xmm0 // first 2 - punpckhbw xmm1, xmm1 // next 2 - pmulhuw xmm0, xmm2 // argb * value - pmulhuw xmm1, xmm2 // argb * value - psrlw xmm0, 8 - psrlw xmm1, 8 - packuswb xmm0, xmm1 - movdqu [edx], xmm0 - lea edx, [edx + 16] - sub ecx, 4 - jg convertloop - - ret - } -} -#endif // HAS_ARGBSHADEROW_SSE2 - -#ifdef HAS_ARGBMULTIPLYROW_SSE2 -// Multiply 2 rows of ARGB pixels together, 4 pixels at a time. -__declspec(naked) void ARGBMultiplyRow_SSE2(const uint8_t* src_argb, - const uint8_t* src_argb1, - uint8_t* dst_argb, - int width) { - __asm { - push esi - mov eax, [esp + 4 + 4] // src_argb - mov esi, [esp + 4 + 8] // src_argb1 - mov edx, [esp + 4 + 12] // dst_argb - mov ecx, [esp + 4 + 16] // width - pxor xmm5, xmm5 // constant 0 - - convertloop: - movdqu xmm0, [eax] // read 4 pixels from src_argb - movdqu xmm2, [esi] // read 4 pixels from src_argb1 - movdqu xmm1, xmm0 - movdqu xmm3, xmm2 - punpcklbw xmm0, xmm0 // first 2 - punpckhbw xmm1, xmm1 // next 2 - punpcklbw xmm2, xmm5 // first 2 - punpckhbw xmm3, xmm5 // next 2 - pmulhuw xmm0, xmm2 // src_argb * src_argb1 first 2 - pmulhuw xmm1, xmm3 // src_argb * src_argb1 next 2 - lea eax, [eax + 16] - lea esi, [esi + 16] - packuswb xmm0, xmm1 - movdqu [edx], xmm0 - lea edx, [edx + 16] - sub ecx, 4 - jg convertloop - - pop esi - ret - } -} -#endif // HAS_ARGBMULTIPLYROW_SSE2 - -#ifdef HAS_ARGBADDROW_SSE2 -// Add 2 rows of ARGB pixels together, 4 pixels at a time. -// TODO(fbarchard): Port this to posix, neon and other math functions. -__declspec(naked) void ARGBAddRow_SSE2(const uint8_t* src_argb, - const uint8_t* src_argb1, - uint8_t* dst_argb, - int width) { - __asm { - push esi - mov eax, [esp + 4 + 4] // src_argb - mov esi, [esp + 4 + 8] // src_argb1 - mov edx, [esp + 4 + 12] // dst_argb - mov ecx, [esp + 4 + 16] // width - - sub ecx, 4 - jl convertloop49 - - convertloop4: - movdqu xmm0, [eax] // read 4 pixels from src_argb - lea eax, [eax + 16] - movdqu xmm1, [esi] // read 4 pixels from src_argb1 - lea esi, [esi + 16] - paddusb xmm0, xmm1 // src_argb + src_argb1 - movdqu [edx], xmm0 - lea edx, [edx + 16] - sub ecx, 4 - jge convertloop4 - - convertloop49: - add ecx, 4 - 1 - jl convertloop19 - - convertloop1: - movd xmm0, [eax] // read 1 pixels from src_argb - lea eax, [eax + 4] - movd xmm1, [esi] // read 1 pixels from src_argb1 - lea esi, [esi + 4] - paddusb xmm0, xmm1 // src_argb + src_argb1 - movd [edx], xmm0 - lea edx, [edx + 4] - sub ecx, 1 - jge convertloop1 - - convertloop19: - pop esi - ret - } -} -#endif // HAS_ARGBADDROW_SSE2 - -#ifdef HAS_ARGBSUBTRACTROW_SSE2 -// Subtract 2 rows of ARGB pixels together, 4 pixels at a time. 
-__declspec(naked) void ARGBSubtractRow_SSE2(const uint8_t* src_argb, - const uint8_t* src_argb1, - uint8_t* dst_argb, - int width) { - __asm { - push esi - mov eax, [esp + 4 + 4] // src_argb - mov esi, [esp + 4 + 8] // src_argb1 - mov edx, [esp + 4 + 12] // dst_argb - mov ecx, [esp + 4 + 16] // width - - convertloop: - movdqu xmm0, [eax] // read 4 pixels from src_argb - lea eax, [eax + 16] - movdqu xmm1, [esi] // read 4 pixels from src_argb1 - lea esi, [esi + 16] - psubusb xmm0, xmm1 // src_argb - src_argb1 - movdqu [edx], xmm0 - lea edx, [edx + 16] - sub ecx, 4 - jg convertloop - - pop esi - ret - } -} -#endif // HAS_ARGBSUBTRACTROW_SSE2 - -#ifdef HAS_ARGBMULTIPLYROW_AVX2 -// Multiply 2 rows of ARGB pixels together, 8 pixels at a time. -__declspec(naked) void ARGBMultiplyRow_AVX2(const uint8_t* src_argb, - const uint8_t* src_argb1, - uint8_t* dst_argb, - int width) { - __asm { - push esi - mov eax, [esp + 4 + 4] // src_argb - mov esi, [esp + 4 + 8] // src_argb1 - mov edx, [esp + 4 + 12] // dst_argb - mov ecx, [esp + 4 + 16] // width - vpxor ymm5, ymm5, ymm5 // constant 0 - - convertloop: - vmovdqu ymm1, [eax] // read 8 pixels from src_argb - lea eax, [eax + 32] - vmovdqu ymm3, [esi] // read 8 pixels from src_argb1 - lea esi, [esi + 32] - vpunpcklbw ymm0, ymm1, ymm1 // low 4 - vpunpckhbw ymm1, ymm1, ymm1 // high 4 - vpunpcklbw ymm2, ymm3, ymm5 // low 4 - vpunpckhbw ymm3, ymm3, ymm5 // high 4 - vpmulhuw ymm0, ymm0, ymm2 // src_argb * src_argb1 low 4 - vpmulhuw ymm1, ymm1, ymm3 // src_argb * src_argb1 high 4 - vpackuswb ymm0, ymm0, ymm1 - vmovdqu [edx], ymm0 - lea edx, [edx + 32] - sub ecx, 8 - jg convertloop - - pop esi - vzeroupper - ret - } -} -#endif // HAS_ARGBMULTIPLYROW_AVX2 - -#ifdef HAS_ARGBADDROW_AVX2 -// Add 2 rows of ARGB pixels together, 8 pixels at a time. -__declspec(naked) void ARGBAddRow_AVX2(const uint8_t* src_argb, - const uint8_t* src_argb1, - uint8_t* dst_argb, - int width) { - __asm { - push esi - mov eax, [esp + 4 + 4] // src_argb - mov esi, [esp + 4 + 8] // src_argb1 - mov edx, [esp + 4 + 12] // dst_argb - mov ecx, [esp + 4 + 16] // width - - convertloop: - vmovdqu ymm0, [eax] // read 8 pixels from src_argb - lea eax, [eax + 32] - vpaddusb ymm0, ymm0, [esi] // add 8 pixels from src_argb1 - lea esi, [esi + 32] - vmovdqu [edx], ymm0 - lea edx, [edx + 32] - sub ecx, 8 - jg convertloop - - pop esi - vzeroupper - ret - } -} -#endif // HAS_ARGBADDROW_AVX2 - -#ifdef HAS_ARGBSUBTRACTROW_AVX2 -// Subtract 2 rows of ARGB pixels together, 8 pixels at a time. 
-__declspec(naked) void ARGBSubtractRow_AVX2(const uint8_t* src_argb, - const uint8_t* src_argb1, - uint8_t* dst_argb, - int width) { - __asm { - push esi - mov eax, [esp + 4 + 4] // src_argb - mov esi, [esp + 4 + 8] // src_argb1 - mov edx, [esp + 4 + 12] // dst_argb - mov ecx, [esp + 4 + 16] // width - - convertloop: - vmovdqu ymm0, [eax] // read 8 pixels from src_argb - lea eax, [eax + 32] - vpsubusb ymm0, ymm0, [esi] // src_argb - src_argb1 - lea esi, [esi + 32] - vmovdqu [edx], ymm0 - lea edx, [edx + 32] - sub ecx, 8 - jg convertloop - - pop esi - vzeroupper - ret - } -} -#endif // HAS_ARGBSUBTRACTROW_AVX2 - -#ifdef HAS_SOBELXROW_SSE2 -// SobelX as a matrix is -// -1 0 1 -// -2 0 2 -// -1 0 1 -__declspec(naked) void SobelXRow_SSE2(const uint8_t* src_y0, - const uint8_t* src_y1, - const uint8_t* src_y2, - uint8_t* dst_sobelx, - int width) { - __asm { - push esi - push edi - mov eax, [esp + 8 + 4] // src_y0 - mov esi, [esp + 8 + 8] // src_y1 - mov edi, [esp + 8 + 12] // src_y2 - mov edx, [esp + 8 + 16] // dst_sobelx - mov ecx, [esp + 8 + 20] // width - sub esi, eax - sub edi, eax - sub edx, eax - pxor xmm5, xmm5 // constant 0 - - convertloop: - movq xmm0, qword ptr [eax] // read 8 pixels from src_y0[0] - movq xmm1, qword ptr [eax + 2] // read 8 pixels from src_y0[2] - punpcklbw xmm0, xmm5 - punpcklbw xmm1, xmm5 - psubw xmm0, xmm1 - movq xmm1, qword ptr [eax + esi] // read 8 pixels from src_y1[0] - movq xmm2, qword ptr [eax + esi + 2] // read 8 pixels from src_y1[2] - punpcklbw xmm1, xmm5 - punpcklbw xmm2, xmm5 - psubw xmm1, xmm2 - movq xmm2, qword ptr [eax + edi] // read 8 pixels from src_y2[0] - movq xmm3, qword ptr [eax + edi + 2] // read 8 pixels from src_y2[2] - punpcklbw xmm2, xmm5 - punpcklbw xmm3, xmm5 - psubw xmm2, xmm3 - paddw xmm0, xmm2 - paddw xmm0, xmm1 - paddw xmm0, xmm1 - pxor xmm1, xmm1 // abs = max(xmm0, -xmm0). SSSE3 could use pabsw - psubw xmm1, xmm0 - pmaxsw xmm0, xmm1 - packuswb xmm0, xmm0 - movq qword ptr [eax + edx], xmm0 - lea eax, [eax + 8] - sub ecx, 8 - jg convertloop - - pop edi - pop esi - ret - } -} -#endif // HAS_SOBELXROW_SSE2 - -#ifdef HAS_SOBELYROW_SSE2 -// SobelY as a matrix is -// -1 -2 -1 -// 0 0 0 -// 1 2 1 -__declspec(naked) void SobelYRow_SSE2(const uint8_t* src_y0, - const uint8_t* src_y1, - uint8_t* dst_sobely, - int width) { - __asm { - push esi - mov eax, [esp + 4 + 4] // src_y0 - mov esi, [esp + 4 + 8] // src_y1 - mov edx, [esp + 4 + 12] // dst_sobely - mov ecx, [esp + 4 + 16] // width - sub esi, eax - sub edx, eax - pxor xmm5, xmm5 // constant 0 - - convertloop: - movq xmm0, qword ptr [eax] // read 8 pixels from src_y0[0] - movq xmm1, qword ptr [eax + esi] // read 8 pixels from src_y1[0] - punpcklbw xmm0, xmm5 - punpcklbw xmm1, xmm5 - psubw xmm0, xmm1 - movq xmm1, qword ptr [eax + 1] // read 8 pixels from src_y0[1] - movq xmm2, qword ptr [eax + esi + 1] // read 8 pixels from src_y1[1] - punpcklbw xmm1, xmm5 - punpcklbw xmm2, xmm5 - psubw xmm1, xmm2 - movq xmm2, qword ptr [eax + 2] // read 8 pixels from src_y0[2] - movq xmm3, qword ptr [eax + esi + 2] // read 8 pixels from src_y1[2] - punpcklbw xmm2, xmm5 - punpcklbw xmm3, xmm5 - psubw xmm2, xmm3 - paddw xmm0, xmm2 - paddw xmm0, xmm1 - paddw xmm0, xmm1 - pxor xmm1, xmm1 // abs = max(xmm0, -xmm0). 
SSSE3 could use pabsw - psubw xmm1, xmm0 - pmaxsw xmm0, xmm1 - packuswb xmm0, xmm0 - movq qword ptr [eax + edx], xmm0 - lea eax, [eax + 8] - sub ecx, 8 - jg convertloop - - pop esi - ret - } -} -#endif // HAS_SOBELYROW_SSE2 - -#ifdef HAS_SOBELROW_SSE2 -// Adds Sobel X and Sobel Y and stores Sobel into ARGB. -// A = 255 -// R = Sobel -// G = Sobel -// B = Sobel -__declspec(naked) void SobelRow_SSE2(const uint8_t* src_sobelx, - const uint8_t* src_sobely, - uint8_t* dst_argb, - int width) { - __asm { - push esi - mov eax, [esp + 4 + 4] // src_sobelx - mov esi, [esp + 4 + 8] // src_sobely - mov edx, [esp + 4 + 12] // dst_argb - mov ecx, [esp + 4 + 16] // width - sub esi, eax - pcmpeqb xmm5, xmm5 // alpha 255 - pslld xmm5, 24 // 0xff000000 - - convertloop: - movdqu xmm0, [eax] // read 16 pixels src_sobelx - movdqu xmm1, [eax + esi] // read 16 pixels src_sobely - lea eax, [eax + 16] - paddusb xmm0, xmm1 // sobel = sobelx + sobely - movdqa xmm2, xmm0 // GG - punpcklbw xmm2, xmm0 // First 8 - punpckhbw xmm0, xmm0 // Next 8 - movdqa xmm1, xmm2 // GGGG - punpcklwd xmm1, xmm2 // First 4 - punpckhwd xmm2, xmm2 // Next 4 - por xmm1, xmm5 // GGGA - por xmm2, xmm5 - movdqa xmm3, xmm0 // GGGG - punpcklwd xmm3, xmm0 // Next 4 - punpckhwd xmm0, xmm0 // Last 4 - por xmm3, xmm5 // GGGA - por xmm0, xmm5 - movdqu [edx], xmm1 - movdqu [edx + 16], xmm2 - movdqu [edx + 32], xmm3 - movdqu [edx + 48], xmm0 - lea edx, [edx + 64] - sub ecx, 16 - jg convertloop - - pop esi - ret - } -} -#endif // HAS_SOBELROW_SSE2 - -#ifdef HAS_SOBELTOPLANEROW_SSE2 -// Adds Sobel X and Sobel Y and stores Sobel into a plane. -__declspec(naked) void SobelToPlaneRow_SSE2(const uint8_t* src_sobelx, - const uint8_t* src_sobely, - uint8_t* dst_y, - int width) { - __asm { - push esi - mov eax, [esp + 4 + 4] // src_sobelx - mov esi, [esp + 4 + 8] // src_sobely - mov edx, [esp + 4 + 12] // dst_argb - mov ecx, [esp + 4 + 16] // width - sub esi, eax - - convertloop: - movdqu xmm0, [eax] // read 16 pixels src_sobelx - movdqu xmm1, [eax + esi] // read 16 pixels src_sobely - lea eax, [eax + 16] - paddusb xmm0, xmm1 // sobel = sobelx + sobely - movdqu [edx], xmm0 - lea edx, [edx + 16] - sub ecx, 16 - jg convertloop - - pop esi - ret - } -} -#endif // HAS_SOBELTOPLANEROW_SSE2 - -#ifdef HAS_SOBELXYROW_SSE2 -// Mixes Sobel X, Sobel Y and Sobel into ARGB. 
-// A = 255 -// R = Sobel X -// G = Sobel -// B = Sobel Y -__declspec(naked) void SobelXYRow_SSE2(const uint8_t* src_sobelx, - const uint8_t* src_sobely, - uint8_t* dst_argb, - int width) { - __asm { - push esi - mov eax, [esp + 4 + 4] // src_sobelx - mov esi, [esp + 4 + 8] // src_sobely - mov edx, [esp + 4 + 12] // dst_argb - mov ecx, [esp + 4 + 16] // width - sub esi, eax - pcmpeqb xmm5, xmm5 // alpha 255 - - convertloop: - movdqu xmm0, [eax] // read 16 pixels src_sobelx - movdqu xmm1, [eax + esi] // read 16 pixels src_sobely - lea eax, [eax + 16] - movdqa xmm2, xmm0 - paddusb xmm2, xmm1 // sobel = sobelx + sobely - movdqa xmm3, xmm0 // XA - punpcklbw xmm3, xmm5 - punpckhbw xmm0, xmm5 - movdqa xmm4, xmm1 // YS - punpcklbw xmm4, xmm2 - punpckhbw xmm1, xmm2 - movdqa xmm6, xmm4 // YSXA - punpcklwd xmm6, xmm3 // First 4 - punpckhwd xmm4, xmm3 // Next 4 - movdqa xmm7, xmm1 // YSXA - punpcklwd xmm7, xmm0 // Next 4 - punpckhwd xmm1, xmm0 // Last 4 - movdqu [edx], xmm6 - movdqu [edx + 16], xmm4 - movdqu [edx + 32], xmm7 - movdqu [edx + 48], xmm1 - lea edx, [edx + 64] - sub ecx, 16 - jg convertloop - - pop esi - ret - } -} -#endif // HAS_SOBELXYROW_SSE2 - -#ifdef HAS_CUMULATIVESUMTOAVERAGEROW_SSE2 -// Consider float CumulativeSum. -// Consider calling CumulativeSum one row at time as needed. -// Consider circular CumulativeSum buffer of radius * 2 + 1 height. -// Convert cumulative sum for an area to an average for 1 pixel. -// topleft is pointer to top left of CumulativeSum buffer for area. -// botleft is pointer to bottom left of CumulativeSum buffer. -// width is offset from left to right of area in CumulativeSum buffer measured -// in number of ints. -// area is the number of pixels in the area being averaged. -// dst points to pixel to store result to. -// count is number of averaged pixels to produce. -// Does 4 pixels at a time. -// This function requires alignment on accumulation buffer pointers. -void CumulativeSumToAverageRow_SSE2(const int32_t* topleft, - const int32_t* botleft, - int width, - int area, - uint8_t* dst, - int count) { - __asm { - mov eax, topleft // eax topleft - mov esi, botleft // esi botleft - mov edx, width - movd xmm5, area - mov edi, dst - mov ecx, count - cvtdq2ps xmm5, xmm5 - rcpss xmm4, xmm5 // 1.0f / area - pshufd xmm4, xmm4, 0 - sub ecx, 4 - jl l4b - - cmp area, 128 // 128 pixels will not overflow 15 bits. - ja l4 - - pshufd xmm5, xmm5, 0 // area - pcmpeqb xmm6, xmm6 // constant of 65536.0 - 1 = 65535.0 - psrld xmm6, 16 - cvtdq2ps xmm6, xmm6 - addps xmm5, xmm6 // (65536.0 + area - 1) - mulps xmm5, xmm4 // (65536.0 + area - 1) * 1 / area - cvtps2dq xmm5, xmm5 // 0.16 fixed point - packssdw xmm5, xmm5 // 16 bit shorts - - // 4 pixel loop small blocks. 
- s4: - // top left - movdqu xmm0, [eax] - movdqu xmm1, [eax + 16] - movdqu xmm2, [eax + 32] - movdqu xmm3, [eax + 48] - - // - top right - psubd xmm0, [eax + edx * 4] - psubd xmm1, [eax + edx * 4 + 16] - psubd xmm2, [eax + edx * 4 + 32] - psubd xmm3, [eax + edx * 4 + 48] - lea eax, [eax + 64] - - // - bottom left - psubd xmm0, [esi] - psubd xmm1, [esi + 16] - psubd xmm2, [esi + 32] - psubd xmm3, [esi + 48] - - // + bottom right - paddd xmm0, [esi + edx * 4] - paddd xmm1, [esi + edx * 4 + 16] - paddd xmm2, [esi + edx * 4 + 32] - paddd xmm3, [esi + edx * 4 + 48] - lea esi, [esi + 64] - - packssdw xmm0, xmm1 // pack 4 pixels into 2 registers - packssdw xmm2, xmm3 - - pmulhuw xmm0, xmm5 - pmulhuw xmm2, xmm5 - - packuswb xmm0, xmm2 - movdqu [edi], xmm0 - lea edi, [edi + 16] - sub ecx, 4 - jge s4 - - jmp l4b - - // 4 pixel loop - l4: - // top left - movdqu xmm0, [eax] - movdqu xmm1, [eax + 16] - movdqu xmm2, [eax + 32] - movdqu xmm3, [eax + 48] - - // - top right - psubd xmm0, [eax + edx * 4] - psubd xmm1, [eax + edx * 4 + 16] - psubd xmm2, [eax + edx * 4 + 32] - psubd xmm3, [eax + edx * 4 + 48] - lea eax, [eax + 64] - - // - bottom left - psubd xmm0, [esi] - psubd xmm1, [esi + 16] - psubd xmm2, [esi + 32] - psubd xmm3, [esi + 48] - - // + bottom right - paddd xmm0, [esi + edx * 4] - paddd xmm1, [esi + edx * 4 + 16] - paddd xmm2, [esi + edx * 4 + 32] - paddd xmm3, [esi + edx * 4 + 48] - lea esi, [esi + 64] - - cvtdq2ps xmm0, xmm0 // Average = Sum * 1 / Area - cvtdq2ps xmm1, xmm1 - mulps xmm0, xmm4 - mulps xmm1, xmm4 - cvtdq2ps xmm2, xmm2 - cvtdq2ps xmm3, xmm3 - mulps xmm2, xmm4 - mulps xmm3, xmm4 - cvtps2dq xmm0, xmm0 - cvtps2dq xmm1, xmm1 - cvtps2dq xmm2, xmm2 - cvtps2dq xmm3, xmm3 - packssdw xmm0, xmm1 - packssdw xmm2, xmm3 - packuswb xmm0, xmm2 - movdqu [edi], xmm0 - lea edi, [edi + 16] - sub ecx, 4 - jge l4 - - l4b: - add ecx, 4 - 1 - jl l1b - - // 1 pixel loop - l1: - movdqu xmm0, [eax] - psubd xmm0, [eax + edx * 4] - lea eax, [eax + 16] - psubd xmm0, [esi] - paddd xmm0, [esi + edx * 4] - lea esi, [esi + 16] - cvtdq2ps xmm0, xmm0 - mulps xmm0, xmm4 - cvtps2dq xmm0, xmm0 - packssdw xmm0, xmm0 - packuswb xmm0, xmm0 - movd dword ptr [edi], xmm0 - lea edi, [edi + 4] - sub ecx, 1 - jge l1 - l1b: - } -} -#endif // HAS_CUMULATIVESUMTOAVERAGEROW_SSE2 - -#ifdef HAS_COMPUTECUMULATIVESUMROW_SSE2 -// Creates a table of cumulative sums where each value is a sum of all values -// above and to the left of the value. -void ComputeCumulativeSumRow_SSE2(const uint8_t* row, - int32_t* cumsum, - const int32_t* previous_cumsum, - int width) { - __asm { - mov eax, row - mov edx, cumsum - mov esi, previous_cumsum - mov ecx, width - pxor xmm0, xmm0 - pxor xmm1, xmm1 - - sub ecx, 4 - jl l4b - test edx, 15 - jne l4b - - // 4 pixel loop - l4: - movdqu xmm2, [eax] // 4 argb pixels 16 bytes. - lea eax, [eax + 16] - movdqa xmm4, xmm2 - - punpcklbw xmm2, xmm1 - movdqa xmm3, xmm2 - punpcklwd xmm2, xmm1 - punpckhwd xmm3, xmm1 - - punpckhbw xmm4, xmm1 - movdqa xmm5, xmm4 - punpcklwd xmm4, xmm1 - punpckhwd xmm5, xmm1 - - paddd xmm0, xmm2 - movdqu xmm2, [esi] // previous row above. 
- paddd xmm2, xmm0 - - paddd xmm0, xmm3 - movdqu xmm3, [esi + 16] - paddd xmm3, xmm0 - - paddd xmm0, xmm4 - movdqu xmm4, [esi + 32] - paddd xmm4, xmm0 - - paddd xmm0, xmm5 - movdqu xmm5, [esi + 48] - lea esi, [esi + 64] - paddd xmm5, xmm0 - - movdqu [edx], xmm2 - movdqu [edx + 16], xmm3 - movdqu [edx + 32], xmm4 - movdqu [edx + 48], xmm5 - - lea edx, [edx + 64] - sub ecx, 4 - jge l4 - - l4b: - add ecx, 4 - 1 - jl l1b - - // 1 pixel loop - l1: - movd xmm2, dword ptr [eax] // 1 argb pixel - lea eax, [eax + 4] - punpcklbw xmm2, xmm1 - punpcklwd xmm2, xmm1 - paddd xmm0, xmm2 - movdqu xmm2, [esi] - lea esi, [esi + 16] - paddd xmm2, xmm0 - movdqu [edx], xmm2 - lea edx, [edx + 16] - sub ecx, 1 - jge l1 - - l1b: - } -} -#endif // HAS_COMPUTECUMULATIVESUMROW_SSE2 - -#ifdef HAS_ARGBAFFINEROW_SSE2 -// Copy ARGB pixels from source image with slope to a row of destination. -__declspec(naked) LIBYUV_API void ARGBAffineRow_SSE2(const uint8_t* src_argb, - int src_argb_stride, - uint8_t* dst_argb, - const float* uv_dudv, - int width) { - __asm { - push esi - push edi - mov eax, [esp + 12] // src_argb - mov esi, [esp + 16] // stride - mov edx, [esp + 20] // dst_argb - mov ecx, [esp + 24] // pointer to uv_dudv - movq xmm2, qword ptr [ecx] // uv - movq xmm7, qword ptr [ecx + 8] // dudv - mov ecx, [esp + 28] // width - shl esi, 16 // 4, stride - add esi, 4 - movd xmm5, esi - sub ecx, 4 - jl l4b - - // setup for 4 pixel loop - pshufd xmm7, xmm7, 0x44 // dup dudv - pshufd xmm5, xmm5, 0 // dup 4, stride - movdqa xmm0, xmm2 // x0, y0, x1, y1 - addps xmm0, xmm7 - movlhps xmm2, xmm0 - movdqa xmm4, xmm7 - addps xmm4, xmm4 // dudv *= 2 - movdqa xmm3, xmm2 // x2, y2, x3, y3 - addps xmm3, xmm4 - addps xmm4, xmm4 // dudv *= 4 - - // 4 pixel loop - l4: - cvttps2dq xmm0, xmm2 // x, y float to int first 2 - cvttps2dq xmm1, xmm3 // x, y float to int next 2 - packssdw xmm0, xmm1 // x, y as 8 shorts - pmaddwd xmm0, xmm5 // offsets = x * 4 + y * stride. - movd esi, xmm0 - pshufd xmm0, xmm0, 0x39 // shift right - movd edi, xmm0 - pshufd xmm0, xmm0, 0x39 // shift right - movd xmm1, [eax + esi] // read pixel 0 - movd xmm6, [eax + edi] // read pixel 1 - punpckldq xmm1, xmm6 // combine pixel 0 and 1 - addps xmm2, xmm4 // x, y += dx, dy first 2 - movq qword ptr [edx], xmm1 - movd esi, xmm0 - pshufd xmm0, xmm0, 0x39 // shift right - movd edi, xmm0 - movd xmm6, [eax + esi] // read pixel 2 - movd xmm0, [eax + edi] // read pixel 3 - punpckldq xmm6, xmm0 // combine pixel 2 and 3 - addps xmm3, xmm4 // x, y += dx, dy next 2 - movq qword ptr 8[edx], xmm6 - lea edx, [edx + 16] - sub ecx, 4 - jge l4 - - l4b: - add ecx, 4 - 1 - jl l1b - - // 1 pixel loop - l1: - cvttps2dq xmm0, xmm2 // x, y float to int - packssdw xmm0, xmm0 // x, y as shorts - pmaddwd xmm0, xmm5 // offset = x * 4 + y * stride - addps xmm2, xmm7 // x, y += dx, dy - movd esi, xmm0 - movd xmm0, [eax + esi] // copy a pixel - movd [edx], xmm0 - lea edx, [edx + 4] - sub ecx, 1 - jge l1 - l1b: - pop edi - pop esi - ret - } -} -#endif // HAS_ARGBAFFINEROW_SSE2 - -#ifdef HAS_INTERPOLATEROW_AVX2 -// Bilinear filter 32x2 -> 32x1 -__declspec(naked) void InterpolateRow_AVX2(uint8_t* dst_ptr, - const uint8_t* src_ptr, - ptrdiff_t src_stride, - int dst_width, - int source_y_fraction) { - __asm { - push esi - push edi - mov edi, [esp + 8 + 4] // dst_ptr - mov esi, [esp + 8 + 8] // src_ptr - mov edx, [esp + 8 + 12] // src_stride - mov ecx, [esp + 8 + 16] // dst_width - mov eax, [esp + 8 + 20] // source_y_fraction (0..255) - // Dispatch to specialized filters if applicable. 
- cmp eax, 0 - je xloop100 // 0 / 256. Blend 100 / 0. - sub edi, esi - cmp eax, 128 - je xloop50 // 128 /256 is 0.50. Blend 50 / 50. - - vmovd xmm0, eax // high fraction 0..255 - neg eax - add eax, 256 - vmovd xmm5, eax // low fraction 256..1 - vpunpcklbw xmm5, xmm5, xmm0 - vpunpcklwd xmm5, xmm5, xmm5 - vbroadcastss ymm5, xmm5 - - mov eax, 0x80808080 // 128b for bias and rounding. - vmovd xmm4, eax - vbroadcastss ymm4, xmm4 - - xloop: - vmovdqu ymm0, [esi] - vmovdqu ymm2, [esi + edx] - vpunpckhbw ymm1, ymm0, ymm2 // mutates - vpunpcklbw ymm0, ymm0, ymm2 - vpsubb ymm1, ymm1, ymm4 // bias to signed image - vpsubb ymm0, ymm0, ymm4 - vpmaddubsw ymm1, ymm5, ymm1 - vpmaddubsw ymm0, ymm5, ymm0 - vpaddw ymm1, ymm1, ymm4 // unbias and round - vpaddw ymm0, ymm0, ymm4 - vpsrlw ymm1, ymm1, 8 - vpsrlw ymm0, ymm0, 8 - vpackuswb ymm0, ymm0, ymm1 // unmutates - vmovdqu [esi + edi], ymm0 - lea esi, [esi + 32] - sub ecx, 32 - jg xloop - jmp xloop99 - - // Blend 50 / 50. - xloop50: - vmovdqu ymm0, [esi] - vpavgb ymm0, ymm0, [esi + edx] - vmovdqu [esi + edi], ymm0 - lea esi, [esi + 32] - sub ecx, 32 - jg xloop50 - jmp xloop99 - - // Blend 100 / 0 - Copy row unchanged. - xloop100: - rep movsb - - xloop99: - pop edi - pop esi - vzeroupper - ret - } -} -#endif // HAS_INTERPOLATEROW_AVX2 - -// Bilinear filter 16x2 -> 16x1 -// TODO(fbarchard): Consider allowing 256 using memcpy. -__declspec(naked) void InterpolateRow_SSSE3(uint8_t* dst_ptr, - const uint8_t* src_ptr, - ptrdiff_t src_stride, - int dst_width, - int source_y_fraction) { - __asm { - push esi - push edi - - mov edi, [esp + 8 + 4] // dst_ptr - mov esi, [esp + 8 + 8] // src_ptr - mov edx, [esp + 8 + 12] // src_stride - mov ecx, [esp + 8 + 16] // dst_width - mov eax, [esp + 8 + 20] // source_y_fraction (0..255) - sub edi, esi - // Dispatch to specialized filters if applicable. - cmp eax, 0 - je xloop100 // 0 /256. Blend 100 / 0. - cmp eax, 128 - je xloop50 // 128 / 256 is 0.50. Blend 50 / 50. - - movd xmm0, eax // high fraction 0..255 - neg eax - add eax, 256 - movd xmm5, eax // low fraction 255..1 - punpcklbw xmm5, xmm0 - punpcklwd xmm5, xmm5 - pshufd xmm5, xmm5, 0 - mov eax, 0x80808080 // 128 for biasing image to signed. - movd xmm4, eax - pshufd xmm4, xmm4, 0x00 - - xloop: - movdqu xmm0, [esi] - movdqu xmm2, [esi + edx] - movdqu xmm1, xmm0 - punpcklbw xmm0, xmm2 - punpckhbw xmm1, xmm2 - psubb xmm0, xmm4 // bias image by -128 - psubb xmm1, xmm4 - movdqa xmm2, xmm5 - movdqa xmm3, xmm5 - pmaddubsw xmm2, xmm0 - pmaddubsw xmm3, xmm1 - paddw xmm2, xmm4 - paddw xmm3, xmm4 - psrlw xmm2, 8 - psrlw xmm3, 8 - packuswb xmm2, xmm3 - movdqu [esi + edi], xmm2 - lea esi, [esi + 16] - sub ecx, 16 - jg xloop - jmp xloop99 - - // Blend 50 / 50. - xloop50: - movdqu xmm0, [esi] - movdqu xmm1, [esi + edx] - pavgb xmm0, xmm1 - movdqu [esi + edi], xmm0 - lea esi, [esi + 16] - sub ecx, 16 - jg xloop50 - jmp xloop99 - - // Blend 100 / 0 - Copy row unchanged. - xloop100: - movdqu xmm0, [esi] - movdqu [esi + edi], xmm0 - lea esi, [esi + 16] - sub ecx, 16 - jg xloop100 - - xloop99: - pop edi - pop esi - ret - } -} - -// For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA. 
-__declspec(naked) void ARGBShuffleRow_SSSE3(const uint8_t* src_argb, - uint8_t* dst_argb, - const uint8_t* shuffler, - int width) { - __asm { - mov eax, [esp + 4] // src_argb - mov edx, [esp + 8] // dst_argb - mov ecx, [esp + 12] // shuffler - movdqu xmm5, [ecx] - mov ecx, [esp + 16] // width - - wloop: - movdqu xmm0, [eax] - movdqu xmm1, [eax + 16] - lea eax, [eax + 32] - pshufb xmm0, xmm5 - pshufb xmm1, xmm5 - movdqu [edx], xmm0 - movdqu [edx + 16], xmm1 - lea edx, [edx + 32] - sub ecx, 8 - jg wloop - ret - } -} - -#ifdef HAS_ARGBSHUFFLEROW_AVX2 -__declspec(naked) void ARGBShuffleRow_AVX2(const uint8_t* src_argb, - uint8_t* dst_argb, - const uint8_t* shuffler, - int width) { - __asm { - mov eax, [esp + 4] // src_argb - mov edx, [esp + 8] // dst_argb - mov ecx, [esp + 12] // shuffler - vbroadcastf128 ymm5, [ecx] // same shuffle in high as low. - mov ecx, [esp + 16] // width - - wloop: - vmovdqu ymm0, [eax] - vmovdqu ymm1, [eax + 32] - lea eax, [eax + 64] - vpshufb ymm0, ymm0, ymm5 - vpshufb ymm1, ymm1, ymm5 - vmovdqu [edx], ymm0 - vmovdqu [edx + 32], ymm1 - lea edx, [edx + 64] - sub ecx, 16 - jg wloop - - vzeroupper - ret - } -} -#endif // HAS_ARGBSHUFFLEROW_AVX2 - -// YUY2 - Macro-pixel = 2 image pixels -// Y0U0Y1V0....Y2U2Y3V2...Y4U4Y5V4.... - -// UYVY - Macro-pixel = 2 image pixels -// U0Y0V0Y1 - -__declspec(naked) void I422ToYUY2Row_SSE2(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* dst_frame, - int width) { - __asm { - push esi - push edi - mov eax, [esp + 8 + 4] // src_y - mov esi, [esp + 8 + 8] // src_u - mov edx, [esp + 8 + 12] // src_v - mov edi, [esp + 8 + 16] // dst_frame - mov ecx, [esp + 8 + 20] // width - sub edx, esi - - convertloop: - movq xmm2, qword ptr [esi] // U - movq xmm3, qword ptr [esi + edx] // V - lea esi, [esi + 8] - punpcklbw xmm2, xmm3 // UV - movdqu xmm0, [eax] // Y - lea eax, [eax + 16] - movdqa xmm1, xmm0 - punpcklbw xmm0, xmm2 // YUYV - punpckhbw xmm1, xmm2 - movdqu [edi], xmm0 - movdqu [edi + 16], xmm1 - lea edi, [edi + 32] - sub ecx, 16 - jg convertloop - - pop edi - pop esi - ret - } -} - -__declspec(naked) void I422ToUYVYRow_SSE2(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* dst_frame, - int width) { - __asm { - push esi - push edi - mov eax, [esp + 8 + 4] // src_y - mov esi, [esp + 8 + 8] // src_u - mov edx, [esp + 8 + 12] // src_v - mov edi, [esp + 8 + 16] // dst_frame - mov ecx, [esp + 8 + 20] // width - sub edx, esi - - convertloop: - movq xmm2, qword ptr [esi] // U - movq xmm3, qword ptr [esi + edx] // V - lea esi, [esi + 8] - punpcklbw xmm2, xmm3 // UV - movdqu xmm0, [eax] // Y - movdqa xmm1, xmm2 - lea eax, [eax + 16] - punpcklbw xmm1, xmm0 // UYVY - punpckhbw xmm2, xmm0 - movdqu [edi], xmm1 - movdqu [edi + 16], xmm2 - lea edi, [edi + 32] - sub ecx, 16 - jg convertloop - - pop edi - pop esi - ret - } -} - -#ifdef HAS_ARGBPOLYNOMIALROW_SSE2 -__declspec(naked) void ARGBPolynomialRow_SSE2(const uint8_t* src_argb, - uint8_t* dst_argb, - const float* poly, - int width) { - __asm { - push esi - mov eax, [esp + 4 + 4] /* src_argb */ - mov edx, [esp + 4 + 8] /* dst_argb */ - mov esi, [esp + 4 + 12] /* poly */ - mov ecx, [esp + 4 + 16] /* width */ - pxor xmm3, xmm3 // 0 constant for zero extending bytes to ints. - - // 2 pixel loop. 
- convertloop: - // pmovzxbd xmm0, dword ptr [eax] // BGRA pixel - // pmovzxbd xmm4, dword ptr [eax + 4] // BGRA pixel - movq xmm0, qword ptr [eax] // BGRABGRA - lea eax, [eax + 8] - punpcklbw xmm0, xmm3 - movdqa xmm4, xmm0 - punpcklwd xmm0, xmm3 // pixel 0 - punpckhwd xmm4, xmm3 // pixel 1 - cvtdq2ps xmm0, xmm0 // 4 floats - cvtdq2ps xmm4, xmm4 - movdqa xmm1, xmm0 // X - movdqa xmm5, xmm4 - mulps xmm0, [esi + 16] // C1 * X - mulps xmm4, [esi + 16] - addps xmm0, [esi] // result = C0 + C1 * X - addps xmm4, [esi] - movdqa xmm2, xmm1 - movdqa xmm6, xmm5 - mulps xmm2, xmm1 // X * X - mulps xmm6, xmm5 - mulps xmm1, xmm2 // X * X * X - mulps xmm5, xmm6 - mulps xmm2, [esi + 32] // C2 * X * X - mulps xmm6, [esi + 32] - mulps xmm1, [esi + 48] // C3 * X * X * X - mulps xmm5, [esi + 48] - addps xmm0, xmm2 // result += C2 * X * X - addps xmm4, xmm6 - addps xmm0, xmm1 // result += C3 * X * X * X - addps xmm4, xmm5 - cvttps2dq xmm0, xmm0 - cvttps2dq xmm4, xmm4 - packuswb xmm0, xmm4 - packuswb xmm0, xmm0 - movq qword ptr [edx], xmm0 - lea edx, [edx + 8] - sub ecx, 2 - jg convertloop - pop esi - ret - } -} -#endif // HAS_ARGBPOLYNOMIALROW_SSE2 - -#ifdef HAS_ARGBPOLYNOMIALROW_AVX2 -__declspec(naked) void ARGBPolynomialRow_AVX2(const uint8_t* src_argb, - uint8_t* dst_argb, - const float* poly, - int width) { - __asm { - mov eax, [esp + 4] /* src_argb */ - mov edx, [esp + 8] /* dst_argb */ - mov ecx, [esp + 12] /* poly */ - vbroadcastf128 ymm4, [ecx] // C0 - vbroadcastf128 ymm5, [ecx + 16] // C1 - vbroadcastf128 ymm6, [ecx + 32] // C2 - vbroadcastf128 ymm7, [ecx + 48] // C3 - mov ecx, [esp + 16] /* width */ - - // 2 pixel loop. - convertloop: - vpmovzxbd ymm0, qword ptr [eax] // 2 BGRA pixels - lea eax, [eax + 8] - vcvtdq2ps ymm0, ymm0 // X 8 floats - vmulps ymm2, ymm0, ymm0 // X * X - vmulps ymm3, ymm0, ymm7 // C3 * X - vfmadd132ps ymm0, ymm4, ymm5 // result = C0 + C1 * X - vfmadd231ps ymm0, ymm2, ymm6 // result += C2 * X * X - vfmadd231ps ymm0, ymm2, ymm3 // result += C3 * X * X * X - vcvttps2dq ymm0, ymm0 - vpackusdw ymm0, ymm0, ymm0 // b0g0r0a0_00000000_b0g0r0a0_00000000 - vpermq ymm0, ymm0, 0xd8 // b0g0r0a0_b0g0r0a0_00000000_00000000 - vpackuswb xmm0, xmm0, xmm0 // bgrabgra_00000000_00000000_00000000 - vmovq qword ptr [edx], xmm0 - lea edx, [edx + 8] - sub ecx, 2 - jg convertloop - vzeroupper - ret - } -} -#endif // HAS_ARGBPOLYNOMIALROW_AVX2 - -#ifdef HAS_HALFFLOATROW_SSE2 -static float kExpBias = 1.9259299444e-34f; -__declspec(naked) void HalfFloatRow_SSE2(const uint16_t* src, - uint16_t* dst, - float scale, - int width) { - __asm { - mov eax, [esp + 4] /* src */ - mov edx, [esp + 8] /* dst */ - movd xmm4, dword ptr [esp + 12] /* scale */ - mov ecx, [esp + 16] /* width */ - mulss xmm4, kExpBias - pshufd xmm4, xmm4, 0 - pxor xmm5, xmm5 - sub edx, eax - - // 8 pixel loop. 
- convertloop: - movdqu xmm2, xmmword ptr [eax] // 8 shorts - add eax, 16 - movdqa xmm3, xmm2 - punpcklwd xmm2, xmm5 - cvtdq2ps xmm2, xmm2 // convert 8 ints to floats - punpckhwd xmm3, xmm5 - cvtdq2ps xmm3, xmm3 - mulps xmm2, xmm4 - mulps xmm3, xmm4 - psrld xmm2, 13 - psrld xmm3, 13 - packssdw xmm2, xmm3 - movdqu [eax + edx - 16], xmm2 - sub ecx, 8 - jg convertloop - ret - } -} -#endif // HAS_HALFFLOATROW_SSE2 - -#ifdef HAS_HALFFLOATROW_AVX2 -__declspec(naked) void HalfFloatRow_AVX2(const uint16_t* src, - uint16_t* dst, - float scale, - int width) { - __asm { - mov eax, [esp + 4] /* src */ - mov edx, [esp + 8] /* dst */ - movd xmm4, dword ptr [esp + 12] /* scale */ - mov ecx, [esp + 16] /* width */ - - vmulss xmm4, xmm4, kExpBias - vbroadcastss ymm4, xmm4 - vpxor ymm5, ymm5, ymm5 - sub edx, eax - - // 16 pixel loop. - convertloop: - vmovdqu ymm2, [eax] // 16 shorts - add eax, 32 - vpunpckhwd ymm3, ymm2, ymm5 // convert 16 shorts to 16 ints - vpunpcklwd ymm2, ymm2, ymm5 - vcvtdq2ps ymm3, ymm3 // convert 16 ints to floats - vcvtdq2ps ymm2, ymm2 - vmulps ymm3, ymm3, ymm4 // scale to adjust exponent for 5 bit range. - vmulps ymm2, ymm2, ymm4 - vpsrld ymm3, ymm3, 13 // float convert to 8 half floats truncate - vpsrld ymm2, ymm2, 13 - vpackssdw ymm2, ymm2, ymm3 - vmovdqu [eax + edx - 32], ymm2 - sub ecx, 16 - jg convertloop - vzeroupper - ret - } -} -#endif // HAS_HALFFLOATROW_AVX2 - -#ifdef HAS_HALFFLOATROW_F16C -__declspec(naked) void HalfFloatRow_F16C(const uint16_t* src, - uint16_t* dst, - float scale, - int width) { - __asm { - mov eax, [esp + 4] /* src */ - mov edx, [esp + 8] /* dst */ - vbroadcastss ymm4, [esp + 12] /* scale */ - mov ecx, [esp + 16] /* width */ - sub edx, eax - - // 16 pixel loop. - convertloop: - vpmovzxwd ymm2, xmmword ptr [eax] // 8 shorts -> 8 ints - vpmovzxwd ymm3, xmmword ptr [eax + 16] // 8 more shorts - add eax, 32 - vcvtdq2ps ymm2, ymm2 // convert 8 ints to floats - vcvtdq2ps ymm3, ymm3 - vmulps ymm2, ymm2, ymm4 // scale to normalized range 0 to 1 - vmulps ymm3, ymm3, ymm4 - vcvtps2ph xmm2, ymm2, 3 // float convert to 8 half floats truncate - vcvtps2ph xmm3, ymm3, 3 - vmovdqu [eax + edx + 32], xmm2 - vmovdqu [eax + edx + 32 + 16], xmm3 - sub ecx, 16 - jg convertloop - vzeroupper - ret - } -} -#endif // HAS_HALFFLOATROW_F16C - -#ifdef HAS_ARGBCOLORTABLEROW_X86 -// Tranform ARGB pixels with color table. -__declspec(naked) void ARGBColorTableRow_X86(uint8_t* dst_argb, - const uint8_t* table_argb, - int width) { - __asm { - push esi - mov eax, [esp + 4 + 4] /* dst_argb */ - mov esi, [esp + 4 + 8] /* table_argb */ - mov ecx, [esp + 4 + 12] /* width */ - - // 1 pixel loop. - convertloop: - movzx edx, byte ptr [eax] - lea eax, [eax + 4] - movzx edx, byte ptr [esi + edx * 4] - mov byte ptr [eax - 4], dl - movzx edx, byte ptr [eax - 4 + 1] - movzx edx, byte ptr [esi + edx * 4 + 1] - mov byte ptr [eax - 4 + 1], dl - movzx edx, byte ptr [eax - 4 + 2] - movzx edx, byte ptr [esi + edx * 4 + 2] - mov byte ptr [eax - 4 + 2], dl - movzx edx, byte ptr [eax - 4 + 3] - movzx edx, byte ptr [esi + edx * 4 + 3] - mov byte ptr [eax - 4 + 3], dl - dec ecx - jg convertloop - pop esi - ret - } -} -#endif // HAS_ARGBCOLORTABLEROW_X86 - -#ifdef HAS_RGBCOLORTABLEROW_X86 -// Tranform RGB pixels with color table. -__declspec(naked) void RGBColorTableRow_X86(uint8_t* dst_argb, - const uint8_t* table_argb, - int width) { - __asm { - push esi - mov eax, [esp + 4 + 4] /* dst_argb */ - mov esi, [esp + 4 + 8] /* table_argb */ - mov ecx, [esp + 4 + 12] /* width */ - - // 1 pixel loop. 
- convertloop: - movzx edx, byte ptr [eax] - lea eax, [eax + 4] - movzx edx, byte ptr [esi + edx * 4] - mov byte ptr [eax - 4], dl - movzx edx, byte ptr [eax - 4 + 1] - movzx edx, byte ptr [esi + edx * 4 + 1] - mov byte ptr [eax - 4 + 1], dl - movzx edx, byte ptr [eax - 4 + 2] - movzx edx, byte ptr [esi + edx * 4 + 2] - mov byte ptr [eax - 4 + 2], dl - dec ecx - jg convertloop - - pop esi - ret - } -} -#endif // HAS_RGBCOLORTABLEROW_X86 - -#ifdef HAS_ARGBLUMACOLORTABLEROW_SSSE3 -// Tranform RGB pixels with luma table. -__declspec(naked) void ARGBLumaColorTableRow_SSSE3(const uint8_t* src_argb, - uint8_t* dst_argb, - int width, - const uint8_t* luma, - uint32_t lumacoeff) { - __asm { - push esi - push edi - mov eax, [esp + 8 + 4] /* src_argb */ - mov edi, [esp + 8 + 8] /* dst_argb */ - mov ecx, [esp + 8 + 12] /* width */ - movd xmm2, dword ptr [esp + 8 + 16] // luma table - movd xmm3, dword ptr [esp + 8 + 20] // lumacoeff - pshufd xmm2, xmm2, 0 - pshufd xmm3, xmm3, 0 - pcmpeqb xmm4, xmm4 // generate mask 0xff00ff00 - psllw xmm4, 8 - pxor xmm5, xmm5 - - // 4 pixel loop. - convertloop: - movdqu xmm0, xmmword ptr [eax] // generate luma ptr - pmaddubsw xmm0, xmm3 - phaddw xmm0, xmm0 - pand xmm0, xmm4 // mask out low bits - punpcklwd xmm0, xmm5 - paddd xmm0, xmm2 // add table base - movd esi, xmm0 - pshufd xmm0, xmm0, 0x39 // 00111001 to rotate right 32 - - movzx edx, byte ptr [eax] - movzx edx, byte ptr [esi + edx] - mov byte ptr [edi], dl - movzx edx, byte ptr [eax + 1] - movzx edx, byte ptr [esi + edx] - mov byte ptr [edi + 1], dl - movzx edx, byte ptr [eax + 2] - movzx edx, byte ptr [esi + edx] - mov byte ptr [edi + 2], dl - movzx edx, byte ptr [eax + 3] // copy alpha. - mov byte ptr [edi + 3], dl - - movd esi, xmm0 - pshufd xmm0, xmm0, 0x39 // 00111001 to rotate right 32 - - movzx edx, byte ptr [eax + 4] - movzx edx, byte ptr [esi + edx] - mov byte ptr [edi + 4], dl - movzx edx, byte ptr [eax + 5] - movzx edx, byte ptr [esi + edx] - mov byte ptr [edi + 5], dl - movzx edx, byte ptr [eax + 6] - movzx edx, byte ptr [esi + edx] - mov byte ptr [edi + 6], dl - movzx edx, byte ptr [eax + 7] // copy alpha. - mov byte ptr [edi + 7], dl - - movd esi, xmm0 - pshufd xmm0, xmm0, 0x39 // 00111001 to rotate right 32 - - movzx edx, byte ptr [eax + 8] - movzx edx, byte ptr [esi + edx] - mov byte ptr [edi + 8], dl - movzx edx, byte ptr [eax + 9] - movzx edx, byte ptr [esi + edx] - mov byte ptr [edi + 9], dl - movzx edx, byte ptr [eax + 10] - movzx edx, byte ptr [esi + edx] - mov byte ptr [edi + 10], dl - movzx edx, byte ptr [eax + 11] // copy alpha. - mov byte ptr [edi + 11], dl - - movd esi, xmm0 - - movzx edx, byte ptr [eax + 12] - movzx edx, byte ptr [esi + edx] - mov byte ptr [edi + 12], dl - movzx edx, byte ptr [eax + 13] - movzx edx, byte ptr [esi + edx] - mov byte ptr [edi + 13], dl - movzx edx, byte ptr [eax + 14] - movzx edx, byte ptr [esi + edx] - mov byte ptr [edi + 14], dl - movzx edx, byte ptr [eax + 15] // copy alpha. 
- mov byte ptr [edi + 15], dl - - lea eax, [eax + 16] - lea edi, [edi + 16] - sub ecx, 4 - jg convertloop - - pop edi - pop esi - ret - } -} -#endif // HAS_ARGBLUMACOLORTABLEROW_SSSE3 - -#endif // defined(_M_X64) - -#ifdef __cplusplus -} // extern "C" -} // namespace libyuv -#endif - -#endif // !defined(LIBYUV_DISABLE_X86) && (defined(_M_IX86) || defined(_M_X64)) diff --git a/thirdparty/libyuv/source/scale.cc b/thirdparty/libyuv/source/scale.cc deleted file mode 100644 index 03b0486..0000000 --- a/thirdparty/libyuv/source/scale.cc +++ /dev/null @@ -1,2385 +0,0 @@ -/* - * Copyright 2011 The LibYuv Project Authors. All rights reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#include "libyuv/scale.h" - -#include -#include - -#include "libyuv/cpu_id.h" -#include "libyuv/planar_functions.h" // For CopyPlane -#include "libyuv/row.h" -#include "libyuv/scale_row.h" -#include "libyuv/scale_uv.h" // For UVScale - -#ifdef __cplusplus -namespace libyuv { -extern "C" { -#endif - -static __inline int Abs(int v) { - return v >= 0 ? v : -v; -} - -#define SUBSAMPLE(v, a, s) (v < 0) ? (-((-v + a) >> s)) : ((v + a) >> s) - -// Scale plane, 1/2 -// This is an optimized version for scaling down a plane to 1/2 of -// its original size. - -static void ScalePlaneDown2(int src_width, - int src_height, - int dst_width, - int dst_height, - int src_stride, - int dst_stride, - const uint8_t* src_ptr, - uint8_t* dst_ptr, - enum FilterMode filtering) { - int y; - void (*ScaleRowDown2)(const uint8_t* src_ptr, ptrdiff_t src_stride, - uint8_t* dst_ptr, int dst_width) = - filtering == kFilterNone - ? ScaleRowDown2_C - : (filtering == kFilterLinear ? ScaleRowDown2Linear_C - : ScaleRowDown2Box_C); - int row_stride = src_stride << 1; - (void)src_width; - (void)src_height; - if (!filtering) { - src_ptr += src_stride; // Point to odd rows. - src_stride = 0; - } - -#if defined(HAS_SCALEROWDOWN2_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - ScaleRowDown2 = - filtering == kFilterNone - ? ScaleRowDown2_Any_NEON - : (filtering == kFilterLinear ? ScaleRowDown2Linear_Any_NEON - : ScaleRowDown2Box_Any_NEON); - if (IS_ALIGNED(dst_width, 16)) { - ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_NEON - : (filtering == kFilterLinear - ? ScaleRowDown2Linear_NEON - : ScaleRowDown2Box_NEON); - } - } -#endif -#if defined(HAS_SCALEROWDOWN2_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - ScaleRowDown2 = - filtering == kFilterNone - ? ScaleRowDown2_Any_SSSE3 - : (filtering == kFilterLinear ? ScaleRowDown2Linear_Any_SSSE3 - : ScaleRowDown2Box_Any_SSSE3); - if (IS_ALIGNED(dst_width, 16)) { - ScaleRowDown2 = - filtering == kFilterNone - ? ScaleRowDown2_SSSE3 - : (filtering == kFilterLinear ? ScaleRowDown2Linear_SSSE3 - : ScaleRowDown2Box_SSSE3); - } - } -#endif -#if defined(HAS_SCALEROWDOWN2_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - ScaleRowDown2 = - filtering == kFilterNone - ? ScaleRowDown2_Any_AVX2 - : (filtering == kFilterLinear ? ScaleRowDown2Linear_Any_AVX2 - : ScaleRowDown2Box_Any_AVX2); - if (IS_ALIGNED(dst_width, 32)) { - ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_AVX2 - : (filtering == kFilterLinear - ? 
ScaleRowDown2Linear_AVX2 - : ScaleRowDown2Box_AVX2); - } - } -#endif -#if defined(HAS_SCALEROWDOWN2_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - ScaleRowDown2 = - filtering == kFilterNone - ? ScaleRowDown2_Any_MMI - : (filtering == kFilterLinear ? ScaleRowDown2Linear_Any_MMI - : ScaleRowDown2Box_Any_MMI); - if (IS_ALIGNED(dst_width, 8)) { - ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_MMI - : (filtering == kFilterLinear - ? ScaleRowDown2Linear_MMI - : ScaleRowDown2Box_MMI); - } - } -#endif -#if defined(HAS_SCALEROWDOWN2_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - ScaleRowDown2 = - filtering == kFilterNone - ? ScaleRowDown2_Any_MSA - : (filtering == kFilterLinear ? ScaleRowDown2Linear_Any_MSA - : ScaleRowDown2Box_Any_MSA); - if (IS_ALIGNED(dst_width, 32)) { - ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_MSA - : (filtering == kFilterLinear - ? ScaleRowDown2Linear_MSA - : ScaleRowDown2Box_MSA); - } - } -#endif - - if (filtering == kFilterLinear) { - src_stride = 0; - } - // TODO(fbarchard): Loop through source height to allow odd height. - for (y = 0; y < dst_height; ++y) { - ScaleRowDown2(src_ptr, src_stride, dst_ptr, dst_width); - src_ptr += row_stride; - dst_ptr += dst_stride; - } -} - -static void ScalePlaneDown2_16(int src_width, - int src_height, - int dst_width, - int dst_height, - int src_stride, - int dst_stride, - const uint16_t* src_ptr, - uint16_t* dst_ptr, - enum FilterMode filtering) { - int y; - void (*ScaleRowDown2)(const uint16_t* src_ptr, ptrdiff_t src_stride, - uint16_t* dst_ptr, int dst_width) = - filtering == kFilterNone - ? ScaleRowDown2_16_C - : (filtering == kFilterLinear ? ScaleRowDown2Linear_16_C - : ScaleRowDown2Box_16_C); - int row_stride = src_stride << 1; - (void)src_width; - (void)src_height; - if (!filtering) { - src_ptr += src_stride; // Point to odd rows. - src_stride = 0; - } - -#if defined(HAS_SCALEROWDOWN2_16_NEON) - if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(dst_width, 16)) { - ScaleRowDown2 = - filtering ? ScaleRowDown2Box_16_NEON : ScaleRowDown2_16_NEON; - } -#endif -#if defined(HAS_SCALEROWDOWN2_16_SSE2) - if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 16)) { - ScaleRowDown2 = - filtering == kFilterNone - ? ScaleRowDown2_16_SSE2 - : (filtering == kFilterLinear ? ScaleRowDown2Linear_16_SSE2 - : ScaleRowDown2Box_16_SSE2); - } -#endif -#if defined(HAS_SCALEROWDOWN2_16_MMI) - if (TestCpuFlag(kCpuHasMMI) && IS_ALIGNED(dst_width, 4)) { - ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_16_MMI - : (filtering == kFilterLinear - ? ScaleRowDown2Linear_16_MMI - : ScaleRowDown2Box_16_MMI); - } -#endif - - if (filtering == kFilterLinear) { - src_stride = 0; - } - // TODO(fbarchard): Loop through source height to allow odd height. - for (y = 0; y < dst_height; ++y) { - ScaleRowDown2(src_ptr, src_stride, dst_ptr, dst_width); - src_ptr += row_stride; - dst_ptr += dst_stride; - } -} - -// Scale plane, 1/4 -// This is an optimized version for scaling down a plane to 1/4 of -// its original size. - -static void ScalePlaneDown4(int src_width, - int src_height, - int dst_width, - int dst_height, - int src_stride, - int dst_stride, - const uint8_t* src_ptr, - uint8_t* dst_ptr, - enum FilterMode filtering) { - int y; - void (*ScaleRowDown4)(const uint8_t* src_ptr, ptrdiff_t src_stride, - uint8_t* dst_ptr, int dst_width) = - filtering ? ScaleRowDown4Box_C : ScaleRowDown4_C; - int row_stride = src_stride << 2; - (void)src_width; - (void)src_height; - if (!filtering) { - src_ptr += src_stride * 2; // Point to row 2. 
- src_stride = 0; - } -#if defined(HAS_SCALEROWDOWN4_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - ScaleRowDown4 = - filtering ? ScaleRowDown4Box_Any_NEON : ScaleRowDown4_Any_NEON; - if (IS_ALIGNED(dst_width, 8)) { - ScaleRowDown4 = filtering ? ScaleRowDown4Box_NEON : ScaleRowDown4_NEON; - } - } -#endif -#if defined(HAS_SCALEROWDOWN4_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - ScaleRowDown4 = - filtering ? ScaleRowDown4Box_Any_SSSE3 : ScaleRowDown4_Any_SSSE3; - if (IS_ALIGNED(dst_width, 8)) { - ScaleRowDown4 = filtering ? ScaleRowDown4Box_SSSE3 : ScaleRowDown4_SSSE3; - } - } -#endif -#if defined(HAS_SCALEROWDOWN4_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - ScaleRowDown4 = - filtering ? ScaleRowDown4Box_Any_AVX2 : ScaleRowDown4_Any_AVX2; - if (IS_ALIGNED(dst_width, 16)) { - ScaleRowDown4 = filtering ? ScaleRowDown4Box_AVX2 : ScaleRowDown4_AVX2; - } - } -#endif -#if defined(HAS_SCALEROWDOWN4_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - ScaleRowDown4 = - filtering ? ScaleRowDown4Box_Any_MMI : ScaleRowDown4_Any_MMI; - if (IS_ALIGNED(dst_width, 8)) { - ScaleRowDown4 = filtering ? ScaleRowDown4Box_MMI : ScaleRowDown4_MMI; - } - } -#endif -#if defined(HAS_SCALEROWDOWN4_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - ScaleRowDown4 = - filtering ? ScaleRowDown4Box_Any_MSA : ScaleRowDown4_Any_MSA; - if (IS_ALIGNED(dst_width, 16)) { - ScaleRowDown4 = filtering ? ScaleRowDown4Box_MSA : ScaleRowDown4_MSA; - } - } -#endif - - if (filtering == kFilterLinear) { - src_stride = 0; - } - for (y = 0; y < dst_height; ++y) { - ScaleRowDown4(src_ptr, src_stride, dst_ptr, dst_width); - src_ptr += row_stride; - dst_ptr += dst_stride; - } -} - -static void ScalePlaneDown4_16(int src_width, - int src_height, - int dst_width, - int dst_height, - int src_stride, - int dst_stride, - const uint16_t* src_ptr, - uint16_t* dst_ptr, - enum FilterMode filtering) { - int y; - void (*ScaleRowDown4)(const uint16_t* src_ptr, ptrdiff_t src_stride, - uint16_t* dst_ptr, int dst_width) = - filtering ? ScaleRowDown4Box_16_C : ScaleRowDown4_16_C; - int row_stride = src_stride << 2; - (void)src_width; - (void)src_height; - if (!filtering) { - src_ptr += src_stride * 2; // Point to row 2. - src_stride = 0; - } -#if defined(HAS_SCALEROWDOWN4_16_NEON) - if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(dst_width, 8)) { - ScaleRowDown4 = - filtering ? ScaleRowDown4Box_16_NEON : ScaleRowDown4_16_NEON; - } -#endif -#if defined(HAS_SCALEROWDOWN4_16_SSE2) - if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8)) { - ScaleRowDown4 = - filtering ? ScaleRowDown4Box_16_SSE2 : ScaleRowDown4_16_SSE2; - } -#endif -#if defined(HAS_SCALEROWDOWN4_16_MMI) - if (TestCpuFlag(kCpuHasMMI) && IS_ALIGNED(dst_width, 8)) { - ScaleRowDown4 = filtering ? ScaleRowDown4Box_16_MMI : ScaleRowDown4_16_MMI; - } -#endif - - if (filtering == kFilterLinear) { - src_stride = 0; - } - for (y = 0; y < dst_height; ++y) { - ScaleRowDown4(src_ptr, src_stride, dst_ptr, dst_width); - src_ptr += row_stride; - dst_ptr += dst_stride; - } -} - -// Scale plane down, 3/4 -static void ScalePlaneDown34(int src_width, - int src_height, - int dst_width, - int dst_height, - int src_stride, - int dst_stride, - const uint8_t* src_ptr, - uint8_t* dst_ptr, - enum FilterMode filtering) { - int y; - void (*ScaleRowDown34_0)(const uint8_t* src_ptr, ptrdiff_t src_stride, - uint8_t* dst_ptr, int dst_width); - void (*ScaleRowDown34_1)(const uint8_t* src_ptr, ptrdiff_t src_stride, - uint8_t* dst_ptr, int dst_width); - const int filter_stride = (filtering == kFilterLinear) ? 
0 : src_stride; - (void)src_width; - (void)src_height; - assert(dst_width % 3 == 0); - if (!filtering) { - ScaleRowDown34_0 = ScaleRowDown34_C; - ScaleRowDown34_1 = ScaleRowDown34_C; - } else { - ScaleRowDown34_0 = ScaleRowDown34_0_Box_C; - ScaleRowDown34_1 = ScaleRowDown34_1_Box_C; - } -#if defined(HAS_SCALEROWDOWN34_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - if (!filtering) { - ScaleRowDown34_0 = ScaleRowDown34_Any_NEON; - ScaleRowDown34_1 = ScaleRowDown34_Any_NEON; - } else { - ScaleRowDown34_0 = ScaleRowDown34_0_Box_Any_NEON; - ScaleRowDown34_1 = ScaleRowDown34_1_Box_Any_NEON; - } - if (dst_width % 24 == 0) { - if (!filtering) { - ScaleRowDown34_0 = ScaleRowDown34_NEON; - ScaleRowDown34_1 = ScaleRowDown34_NEON; - } else { - ScaleRowDown34_0 = ScaleRowDown34_0_Box_NEON; - ScaleRowDown34_1 = ScaleRowDown34_1_Box_NEON; - } - } - } -#endif -#if defined(HAS_SCALEROWDOWN34_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - if (!filtering) { - ScaleRowDown34_0 = ScaleRowDown34_Any_MMI; - ScaleRowDown34_1 = ScaleRowDown34_Any_MMI; - if (dst_width % 24 == 0) { - ScaleRowDown34_0 = ScaleRowDown34_MMI; - ScaleRowDown34_1 = ScaleRowDown34_MMI; - } - } - } -#endif -#if defined(HAS_SCALEROWDOWN34_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - if (!filtering) { - ScaleRowDown34_0 = ScaleRowDown34_Any_MSA; - ScaleRowDown34_1 = ScaleRowDown34_Any_MSA; - } else { - ScaleRowDown34_0 = ScaleRowDown34_0_Box_Any_MSA; - ScaleRowDown34_1 = ScaleRowDown34_1_Box_Any_MSA; - } - if (dst_width % 48 == 0) { - if (!filtering) { - ScaleRowDown34_0 = ScaleRowDown34_MSA; - ScaleRowDown34_1 = ScaleRowDown34_MSA; - } else { - ScaleRowDown34_0 = ScaleRowDown34_0_Box_MSA; - ScaleRowDown34_1 = ScaleRowDown34_1_Box_MSA; - } - } - } -#endif -#if defined(HAS_SCALEROWDOWN34_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - if (!filtering) { - ScaleRowDown34_0 = ScaleRowDown34_Any_SSSE3; - ScaleRowDown34_1 = ScaleRowDown34_Any_SSSE3; - } else { - ScaleRowDown34_0 = ScaleRowDown34_0_Box_Any_SSSE3; - ScaleRowDown34_1 = ScaleRowDown34_1_Box_Any_SSSE3; - } - if (dst_width % 24 == 0) { - if (!filtering) { - ScaleRowDown34_0 = ScaleRowDown34_SSSE3; - ScaleRowDown34_1 = ScaleRowDown34_SSSE3; - } else { - ScaleRowDown34_0 = ScaleRowDown34_0_Box_SSSE3; - ScaleRowDown34_1 = ScaleRowDown34_1_Box_SSSE3; - } - } - } -#endif - - for (y = 0; y < dst_height - 2; y += 3) { - ScaleRowDown34_0(src_ptr, filter_stride, dst_ptr, dst_width); - src_ptr += src_stride; - dst_ptr += dst_stride; - ScaleRowDown34_1(src_ptr, filter_stride, dst_ptr, dst_width); - src_ptr += src_stride; - dst_ptr += dst_stride; - ScaleRowDown34_0(src_ptr + src_stride, -filter_stride, dst_ptr, dst_width); - src_ptr += src_stride * 2; - dst_ptr += dst_stride; - } - - // Remainder 1 or 2 rows with last row vertically unfiltered - if ((dst_height % 3) == 2) { - ScaleRowDown34_0(src_ptr, filter_stride, dst_ptr, dst_width); - src_ptr += src_stride; - dst_ptr += dst_stride; - ScaleRowDown34_1(src_ptr, 0, dst_ptr, dst_width); - } else if ((dst_height % 3) == 1) { - ScaleRowDown34_0(src_ptr, 0, dst_ptr, dst_width); - } -} - -static void ScalePlaneDown34_16(int src_width, - int src_height, - int dst_width, - int dst_height, - int src_stride, - int dst_stride, - const uint16_t* src_ptr, - uint16_t* dst_ptr, - enum FilterMode filtering) { - int y; - void (*ScaleRowDown34_0)(const uint16_t* src_ptr, ptrdiff_t src_stride, - uint16_t* dst_ptr, int dst_width); - void (*ScaleRowDown34_1)(const uint16_t* src_ptr, ptrdiff_t src_stride, - uint16_t* dst_ptr, int dst_width); - const int filter_stride = (filtering == 
kFilterLinear) ? 0 : src_stride; - (void)src_width; - (void)src_height; - assert(dst_width % 3 == 0); - if (!filtering) { - ScaleRowDown34_0 = ScaleRowDown34_16_C; - ScaleRowDown34_1 = ScaleRowDown34_16_C; - } else { - ScaleRowDown34_0 = ScaleRowDown34_0_Box_16_C; - ScaleRowDown34_1 = ScaleRowDown34_1_Box_16_C; - } -#if defined(HAS_SCALEROWDOWN34_16_NEON) - if (TestCpuFlag(kCpuHasNEON) && (dst_width % 24 == 0)) { - if (!filtering) { - ScaleRowDown34_0 = ScaleRowDown34_16_NEON; - ScaleRowDown34_1 = ScaleRowDown34_16_NEON; - } else { - ScaleRowDown34_0 = ScaleRowDown34_0_Box_16_NEON; - ScaleRowDown34_1 = ScaleRowDown34_1_Box_16_NEON; - } - } -#endif -#if defined(HAS_SCALEROWDOWN34_16_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3) && (dst_width % 24 == 0)) { - if (!filtering) { - ScaleRowDown34_0 = ScaleRowDown34_16_SSSE3; - ScaleRowDown34_1 = ScaleRowDown34_16_SSSE3; - } else { - ScaleRowDown34_0 = ScaleRowDown34_0_Box_16_SSSE3; - ScaleRowDown34_1 = ScaleRowDown34_1_Box_16_SSSE3; - } - } -#endif - - for (y = 0; y < dst_height - 2; y += 3) { - ScaleRowDown34_0(src_ptr, filter_stride, dst_ptr, dst_width); - src_ptr += src_stride; - dst_ptr += dst_stride; - ScaleRowDown34_1(src_ptr, filter_stride, dst_ptr, dst_width); - src_ptr += src_stride; - dst_ptr += dst_stride; - ScaleRowDown34_0(src_ptr + src_stride, -filter_stride, dst_ptr, dst_width); - src_ptr += src_stride * 2; - dst_ptr += dst_stride; - } - - // Remainder 1 or 2 rows with last row vertically unfiltered - if ((dst_height % 3) == 2) { - ScaleRowDown34_0(src_ptr, filter_stride, dst_ptr, dst_width); - src_ptr += src_stride; - dst_ptr += dst_stride; - ScaleRowDown34_1(src_ptr, 0, dst_ptr, dst_width); - } else if ((dst_height % 3) == 1) { - ScaleRowDown34_0(src_ptr, 0, dst_ptr, dst_width); - } -} - -// Scale plane, 3/8 -// This is an optimized version for scaling down a plane to 3/8 -// of its original size. -// -// Uses box filter arranges like this -// aaabbbcc -> abc -// aaabbbcc def -// aaabbbcc ghi -// dddeeeff -// dddeeeff -// dddeeeff -// ggghhhii -// ggghhhii -// Boxes are 3x3, 2x3, 3x2 and 2x2 - -static void ScalePlaneDown38(int src_width, - int src_height, - int dst_width, - int dst_height, - int src_stride, - int dst_stride, - const uint8_t* src_ptr, - uint8_t* dst_ptr, - enum FilterMode filtering) { - int y; - void (*ScaleRowDown38_3)(const uint8_t* src_ptr, ptrdiff_t src_stride, - uint8_t* dst_ptr, int dst_width); - void (*ScaleRowDown38_2)(const uint8_t* src_ptr, ptrdiff_t src_stride, - uint8_t* dst_ptr, int dst_width); - const int filter_stride = (filtering == kFilterLinear) ? 
0 : src_stride; - assert(dst_width % 3 == 0); - (void)src_width; - (void)src_height; - if (!filtering) { - ScaleRowDown38_3 = ScaleRowDown38_C; - ScaleRowDown38_2 = ScaleRowDown38_C; - } else { - ScaleRowDown38_3 = ScaleRowDown38_3_Box_C; - ScaleRowDown38_2 = ScaleRowDown38_2_Box_C; - } - -#if defined(HAS_SCALEROWDOWN38_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - if (!filtering) { - ScaleRowDown38_3 = ScaleRowDown38_Any_NEON; - ScaleRowDown38_2 = ScaleRowDown38_Any_NEON; - } else { - ScaleRowDown38_3 = ScaleRowDown38_3_Box_Any_NEON; - ScaleRowDown38_2 = ScaleRowDown38_2_Box_Any_NEON; - } - if (dst_width % 12 == 0) { - if (!filtering) { - ScaleRowDown38_3 = ScaleRowDown38_NEON; - ScaleRowDown38_2 = ScaleRowDown38_NEON; - } else { - ScaleRowDown38_3 = ScaleRowDown38_3_Box_NEON; - ScaleRowDown38_2 = ScaleRowDown38_2_Box_NEON; - } - } - } -#endif -#if defined(HAS_SCALEROWDOWN38_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - if (!filtering) { - ScaleRowDown38_3 = ScaleRowDown38_Any_SSSE3; - ScaleRowDown38_2 = ScaleRowDown38_Any_SSSE3; - } else { - ScaleRowDown38_3 = ScaleRowDown38_3_Box_Any_SSSE3; - ScaleRowDown38_2 = ScaleRowDown38_2_Box_Any_SSSE3; - } - if (dst_width % 12 == 0 && !filtering) { - ScaleRowDown38_3 = ScaleRowDown38_SSSE3; - ScaleRowDown38_2 = ScaleRowDown38_SSSE3; - } - if (dst_width % 6 == 0 && filtering) { - ScaleRowDown38_3 = ScaleRowDown38_3_Box_SSSE3; - ScaleRowDown38_2 = ScaleRowDown38_2_Box_SSSE3; - } - } -#endif -#if defined(HAS_SCALEROWDOWN38_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - if (!filtering) { - ScaleRowDown38_3 = ScaleRowDown38_Any_MSA; - ScaleRowDown38_2 = ScaleRowDown38_Any_MSA; - } else { - ScaleRowDown38_3 = ScaleRowDown38_3_Box_Any_MSA; - ScaleRowDown38_2 = ScaleRowDown38_2_Box_Any_MSA; - } - if (dst_width % 12 == 0) { - if (!filtering) { - ScaleRowDown38_3 = ScaleRowDown38_MSA; - ScaleRowDown38_2 = ScaleRowDown38_MSA; - } else { - ScaleRowDown38_3 = ScaleRowDown38_3_Box_MSA; - ScaleRowDown38_2 = ScaleRowDown38_2_Box_MSA; - } - } - } -#endif - - for (y = 0; y < dst_height - 2; y += 3) { - ScaleRowDown38_3(src_ptr, filter_stride, dst_ptr, dst_width); - src_ptr += src_stride * 3; - dst_ptr += dst_stride; - ScaleRowDown38_3(src_ptr, filter_stride, dst_ptr, dst_width); - src_ptr += src_stride * 3; - dst_ptr += dst_stride; - ScaleRowDown38_2(src_ptr, filter_stride, dst_ptr, dst_width); - src_ptr += src_stride * 2; - dst_ptr += dst_stride; - } - - // Remainder 1 or 2 rows with last row vertically unfiltered - if ((dst_height % 3) == 2) { - ScaleRowDown38_3(src_ptr, filter_stride, dst_ptr, dst_width); - src_ptr += src_stride * 3; - dst_ptr += dst_stride; - ScaleRowDown38_3(src_ptr, 0, dst_ptr, dst_width); - } else if ((dst_height % 3) == 1) { - ScaleRowDown38_3(src_ptr, 0, dst_ptr, dst_width); - } -} - -static void ScalePlaneDown38_16(int src_width, - int src_height, - int dst_width, - int dst_height, - int src_stride, - int dst_stride, - const uint16_t* src_ptr, - uint16_t* dst_ptr, - enum FilterMode filtering) { - int y; - void (*ScaleRowDown38_3)(const uint16_t* src_ptr, ptrdiff_t src_stride, - uint16_t* dst_ptr, int dst_width); - void (*ScaleRowDown38_2)(const uint16_t* src_ptr, ptrdiff_t src_stride, - uint16_t* dst_ptr, int dst_width); - const int filter_stride = (filtering == kFilterLinear) ? 
0 : src_stride; - (void)src_width; - (void)src_height; - assert(dst_width % 3 == 0); - if (!filtering) { - ScaleRowDown38_3 = ScaleRowDown38_16_C; - ScaleRowDown38_2 = ScaleRowDown38_16_C; - } else { - ScaleRowDown38_3 = ScaleRowDown38_3_Box_16_C; - ScaleRowDown38_2 = ScaleRowDown38_2_Box_16_C; - } -#if defined(HAS_SCALEROWDOWN38_16_NEON) - if (TestCpuFlag(kCpuHasNEON) && (dst_width % 12 == 0)) { - if (!filtering) { - ScaleRowDown38_3 = ScaleRowDown38_16_NEON; - ScaleRowDown38_2 = ScaleRowDown38_16_NEON; - } else { - ScaleRowDown38_3 = ScaleRowDown38_3_Box_16_NEON; - ScaleRowDown38_2 = ScaleRowDown38_2_Box_16_NEON; - } - } -#endif -#if defined(HAS_SCALEROWDOWN38_16_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3) && (dst_width % 24 == 0)) { - if (!filtering) { - ScaleRowDown38_3 = ScaleRowDown38_16_SSSE3; - ScaleRowDown38_2 = ScaleRowDown38_16_SSSE3; - } else { - ScaleRowDown38_3 = ScaleRowDown38_3_Box_16_SSSE3; - ScaleRowDown38_2 = ScaleRowDown38_2_Box_16_SSSE3; - } - } -#endif - - for (y = 0; y < dst_height - 2; y += 3) { - ScaleRowDown38_3(src_ptr, filter_stride, dst_ptr, dst_width); - src_ptr += src_stride * 3; - dst_ptr += dst_stride; - ScaleRowDown38_3(src_ptr, filter_stride, dst_ptr, dst_width); - src_ptr += src_stride * 3; - dst_ptr += dst_stride; - ScaleRowDown38_2(src_ptr, filter_stride, dst_ptr, dst_width); - src_ptr += src_stride * 2; - dst_ptr += dst_stride; - } - - // Remainder 1 or 2 rows with last row vertically unfiltered - if ((dst_height % 3) == 2) { - ScaleRowDown38_3(src_ptr, filter_stride, dst_ptr, dst_width); - src_ptr += src_stride * 3; - dst_ptr += dst_stride; - ScaleRowDown38_3(src_ptr, 0, dst_ptr, dst_width); - } else if ((dst_height % 3) == 1) { - ScaleRowDown38_3(src_ptr, 0, dst_ptr, dst_width); - } -} - -#define MIN1(x) ((x) < 1 ? 
1 : (x)) - -static __inline uint32_t SumPixels(int iboxwidth, const uint16_t* src_ptr) { - uint32_t sum = 0u; - int x; - assert(iboxwidth > 0); - for (x = 0; x < iboxwidth; ++x) { - sum += src_ptr[x]; - } - return sum; -} - -static __inline uint32_t SumPixels_16(int iboxwidth, const uint32_t* src_ptr) { - uint32_t sum = 0u; - int x; - assert(iboxwidth > 0); - for (x = 0; x < iboxwidth; ++x) { - sum += src_ptr[x]; - } - return sum; -} - -static void ScaleAddCols2_C(int dst_width, - int boxheight, - int x, - int dx, - const uint16_t* src_ptr, - uint8_t* dst_ptr) { - int i; - int scaletbl[2]; - int minboxwidth = dx >> 16; - int boxwidth; - scaletbl[0] = 65536 / (MIN1(minboxwidth) * boxheight); - scaletbl[1] = 65536 / (MIN1(minboxwidth + 1) * boxheight); - for (i = 0; i < dst_width; ++i) { - int ix = x >> 16; - x += dx; - boxwidth = MIN1((x >> 16) - ix); - *dst_ptr++ = - SumPixels(boxwidth, src_ptr + ix) * scaletbl[boxwidth - minboxwidth] >> - 16; - } -} - -static void ScaleAddCols2_16_C(int dst_width, - int boxheight, - int x, - int dx, - const uint32_t* src_ptr, - uint16_t* dst_ptr) { - int i; - int scaletbl[2]; - int minboxwidth = dx >> 16; - int boxwidth; - scaletbl[0] = 65536 / (MIN1(minboxwidth) * boxheight); - scaletbl[1] = 65536 / (MIN1(minboxwidth + 1) * boxheight); - for (i = 0; i < dst_width; ++i) { - int ix = x >> 16; - x += dx; - boxwidth = MIN1((x >> 16) - ix); - *dst_ptr++ = SumPixels_16(boxwidth, src_ptr + ix) * - scaletbl[boxwidth - minboxwidth] >> - 16; - } -} - -static void ScaleAddCols0_C(int dst_width, - int boxheight, - int x, - int dx, - const uint16_t* src_ptr, - uint8_t* dst_ptr) { - int scaleval = 65536 / boxheight; - int i; - (void)dx; - src_ptr += (x >> 16); - for (i = 0; i < dst_width; ++i) { - *dst_ptr++ = src_ptr[i] * scaleval >> 16; - } -} - -static void ScaleAddCols1_C(int dst_width, - int boxheight, - int x, - int dx, - const uint16_t* src_ptr, - uint8_t* dst_ptr) { - int boxwidth = MIN1(dx >> 16); - int scaleval = 65536 / (boxwidth * boxheight); - int i; - x >>= 16; - for (i = 0; i < dst_width; ++i) { - *dst_ptr++ = SumPixels(boxwidth, src_ptr + x) * scaleval >> 16; - x += boxwidth; - } -} - -static void ScaleAddCols1_16_C(int dst_width, - int boxheight, - int x, - int dx, - const uint32_t* src_ptr, - uint16_t* dst_ptr) { - int boxwidth = MIN1(dx >> 16); - int scaleval = 65536 / (boxwidth * boxheight); - int i; - for (i = 0; i < dst_width; ++i) { - *dst_ptr++ = SumPixels_16(boxwidth, src_ptr + x) * scaleval >> 16; - x += boxwidth; - } -} - -// Scale plane down to any dimensions, with interpolation. -// (boxfilter). -// -// Same method as SimpleScale, which is fixed point, outputting -// one pixel of destination using fixed point (16.16) to step -// through source, sampling a box of pixel with simple -// averaging. -static void ScalePlaneBox(int src_width, - int src_height, - int dst_width, - int dst_height, - int src_stride, - int dst_stride, - const uint8_t* src_ptr, - uint8_t* dst_ptr) { - int j, k; - // Initial source x/y coordinate and step values as 16.16 fixed point. - int x = 0; - int y = 0; - int dx = 0; - int dy = 0; - const int max_y = (src_height << 16); - ScaleSlope(src_width, src_height, dst_width, dst_height, kFilterBox, &x, &y, - &dx, &dy); - src_width = Abs(src_width); - { - // Allocate a row buffer of uint16_t. - align_buffer_64(row16, src_width * 2); - void (*ScaleAddCols)(int dst_width, int boxheight, int x, int dx, - const uint16_t* src_ptr, uint8_t* dst_ptr) = - (dx & 0xffff) ? ScaleAddCols2_C - : ((dx != 0x10000) ? 
ScaleAddCols1_C : ScaleAddCols0_C); - void (*ScaleAddRow)(const uint8_t* src_ptr, uint16_t* dst_ptr, - int src_width) = ScaleAddRow_C; -#if defined(HAS_SCALEADDROW_SSE2) - if (TestCpuFlag(kCpuHasSSE2)) { - ScaleAddRow = ScaleAddRow_Any_SSE2; - if (IS_ALIGNED(src_width, 16)) { - ScaleAddRow = ScaleAddRow_SSE2; - } - } -#endif -#if defined(HAS_SCALEADDROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - ScaleAddRow = ScaleAddRow_Any_AVX2; - if (IS_ALIGNED(src_width, 32)) { - ScaleAddRow = ScaleAddRow_AVX2; - } - } -#endif -#if defined(HAS_SCALEADDROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - ScaleAddRow = ScaleAddRow_Any_NEON; - if (IS_ALIGNED(src_width, 16)) { - ScaleAddRow = ScaleAddRow_NEON; - } - } -#endif -#if defined(HAS_SCALEADDROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - ScaleAddRow = ScaleAddRow_Any_MMI; - if (IS_ALIGNED(src_width, 8)) { - ScaleAddRow = ScaleAddRow_MMI; - } - } -#endif -#if defined(HAS_SCALEADDROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - ScaleAddRow = ScaleAddRow_Any_MSA; - if (IS_ALIGNED(src_width, 16)) { - ScaleAddRow = ScaleAddRow_MSA; - } - } -#endif - - for (j = 0; j < dst_height; ++j) { - int boxheight; - int iy = y >> 16; - const uint8_t* src = src_ptr + iy * src_stride; - y += dy; - if (y > max_y) { - y = max_y; - } - boxheight = MIN1((y >> 16) - iy); - memset(row16, 0, src_width * 2); - for (k = 0; k < boxheight; ++k) { - ScaleAddRow(src, (uint16_t*)(row16), src_width); - src += src_stride; - } - ScaleAddCols(dst_width, boxheight, x, dx, (uint16_t*)(row16), dst_ptr); - dst_ptr += dst_stride; - } - free_aligned_buffer_64(row16); - } -} - -static void ScalePlaneBox_16(int src_width, - int src_height, - int dst_width, - int dst_height, - int src_stride, - int dst_stride, - const uint16_t* src_ptr, - uint16_t* dst_ptr) { - int j, k; - // Initial source x/y coordinate and step values as 16.16 fixed point. - int x = 0; - int y = 0; - int dx = 0; - int dy = 0; - const int max_y = (src_height << 16); - ScaleSlope(src_width, src_height, dst_width, dst_height, kFilterBox, &x, &y, - &dx, &dy); - src_width = Abs(src_width); - { - // Allocate a row buffer of uint32_t. - align_buffer_64(row32, src_width * 4); - void (*ScaleAddCols)(int dst_width, int boxheight, int x, int dx, - const uint32_t* src_ptr, uint16_t* dst_ptr) = - (dx & 0xffff) ? ScaleAddCols2_16_C : ScaleAddCols1_16_C; - void (*ScaleAddRow)(const uint16_t* src_ptr, uint32_t* dst_ptr, - int src_width) = ScaleAddRow_16_C; - -#if defined(HAS_SCALEADDROW_16_SSE2) - if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(src_width, 16)) { - ScaleAddRow = ScaleAddRow_16_SSE2; - } -#endif - -#if defined(HAS_SCALEADDROW_16_MMI) - if (TestCpuFlag(kCpuHasMMI) && IS_ALIGNED(src_width, 4)) { - ScaleAddRow = ScaleAddRow_16_MMI; - } -#endif - for (j = 0; j < dst_height; ++j) { - int boxheight; - int iy = y >> 16; - const uint16_t* src = src_ptr + iy * src_stride; - y += dy; - if (y > max_y) { - y = max_y; - } - boxheight = MIN1((y >> 16) - iy); - memset(row32, 0, src_width * 4); - for (k = 0; k < boxheight; ++k) { - ScaleAddRow(src, (uint32_t*)(row32), src_width); - src += src_stride; - } - ScaleAddCols(dst_width, boxheight, x, dx, (uint32_t*)(row32), dst_ptr); - dst_ptr += dst_stride; - } - free_aligned_buffer_64(row32); - } -} - -// Scale plane down with bilinear interpolation. 
-void ScalePlaneBilinearDown(int src_width, - int src_height, - int dst_width, - int dst_height, - int src_stride, - int dst_stride, - const uint8_t* src_ptr, - uint8_t* dst_ptr, - enum FilterMode filtering) { - // Initial source x/y coordinate and step values as 16.16 fixed point. - int x = 0; - int y = 0; - int dx = 0; - int dy = 0; - // TODO(fbarchard): Consider not allocating row buffer for kFilterLinear. - // Allocate a row buffer. - align_buffer_64(row, src_width); - - const int max_y = (src_height - 1) << 16; - int j; - void (*ScaleFilterCols)(uint8_t * dst_ptr, const uint8_t* src_ptr, - int dst_width, int x, int dx) = - (src_width >= 32768) ? ScaleFilterCols64_C : ScaleFilterCols_C; - void (*InterpolateRow)(uint8_t * dst_ptr, const uint8_t* src_ptr, - ptrdiff_t src_stride, int dst_width, - int source_y_fraction) = InterpolateRow_C; - ScaleSlope(src_width, src_height, dst_width, dst_height, filtering, &x, &y, - &dx, &dy); - src_width = Abs(src_width); - -#if defined(HAS_INTERPOLATEROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - InterpolateRow = InterpolateRow_Any_SSSE3; - if (IS_ALIGNED(src_width, 16)) { - InterpolateRow = InterpolateRow_SSSE3; - } - } -#endif -#if defined(HAS_INTERPOLATEROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - InterpolateRow = InterpolateRow_Any_AVX2; - if (IS_ALIGNED(src_width, 32)) { - InterpolateRow = InterpolateRow_AVX2; - } - } -#endif -#if defined(HAS_INTERPOLATEROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - InterpolateRow = InterpolateRow_Any_NEON; - if (IS_ALIGNED(src_width, 16)) { - InterpolateRow = InterpolateRow_NEON; - } - } -#endif -#if defined(HAS_INTERPOLATEROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - InterpolateRow = InterpolateRow_Any_MMI; - if (IS_ALIGNED(src_width, 16)) { - InterpolateRow = InterpolateRow_MMI; - } - } -#endif -#if defined(HAS_INTERPOLATEROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - InterpolateRow = InterpolateRow_Any_MSA; - if (IS_ALIGNED(src_width, 32)) { - InterpolateRow = InterpolateRow_MSA; - } - } -#endif - -#if defined(HAS_SCALEFILTERCOLS_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) { - ScaleFilterCols = ScaleFilterCols_SSSE3; - } -#endif -#if defined(HAS_SCALEFILTERCOLS_NEON) - if (TestCpuFlag(kCpuHasNEON) && src_width < 32768) { - ScaleFilterCols = ScaleFilterCols_Any_NEON; - if (IS_ALIGNED(dst_width, 8)) { - ScaleFilterCols = ScaleFilterCols_NEON; - } - } -#endif -#if defined(HAS_SCALEFILTERCOLS_MSA) - if (TestCpuFlag(kCpuHasMSA) && src_width < 32768) { - ScaleFilterCols = ScaleFilterCols_Any_MSA; - if (IS_ALIGNED(dst_width, 16)) { - ScaleFilterCols = ScaleFilterCols_MSA; - } - } -#endif - if (y > max_y) { - y = max_y; - } - - for (j = 0; j < dst_height; ++j) { - int yi = y >> 16; - const uint8_t* src = src_ptr + yi * src_stride; - if (filtering == kFilterLinear) { - ScaleFilterCols(dst_ptr, src, dst_width, x, dx); - } else { - int yf = (y >> 8) & 255; - InterpolateRow(row, src, src_stride, src_width, yf); - ScaleFilterCols(dst_ptr, row, dst_width, x, dx); - } - dst_ptr += dst_stride; - y += dy; - if (y > max_y) { - y = max_y; - } - } - free_aligned_buffer_64(row); -} - -void ScalePlaneBilinearDown_16(int src_width, - int src_height, - int dst_width, - int dst_height, - int src_stride, - int dst_stride, - const uint16_t* src_ptr, - uint16_t* dst_ptr, - enum FilterMode filtering) { - // Initial source x/y coordinate and step values as 16.16 fixed point. - int x = 0; - int y = 0; - int dx = 0; - int dy = 0; - // TODO(fbarchard): Consider not allocating row buffer for kFilterLinear. 
- // Allocate a row buffer. - align_buffer_64(row, src_width * 2); - - const int max_y = (src_height - 1) << 16; - int j; - void (*ScaleFilterCols)(uint16_t * dst_ptr, const uint16_t* src_ptr, - int dst_width, int x, int dx) = - (src_width >= 32768) ? ScaleFilterCols64_16_C : ScaleFilterCols_16_C; - void (*InterpolateRow)(uint16_t * dst_ptr, const uint16_t* src_ptr, - ptrdiff_t src_stride, int dst_width, - int source_y_fraction) = InterpolateRow_16_C; - ScaleSlope(src_width, src_height, dst_width, dst_height, filtering, &x, &y, - &dx, &dy); - src_width = Abs(src_width); - -#if defined(HAS_INTERPOLATEROW_16_SSE2) - if (TestCpuFlag(kCpuHasSSE2)) { - InterpolateRow = InterpolateRow_Any_16_SSE2; - if (IS_ALIGNED(src_width, 16)) { - InterpolateRow = InterpolateRow_16_SSE2; - } - } -#endif -#if defined(HAS_INTERPOLATEROW_16_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - InterpolateRow = InterpolateRow_Any_16_SSSE3; - if (IS_ALIGNED(src_width, 16)) { - InterpolateRow = InterpolateRow_16_SSSE3; - } - } -#endif -#if defined(HAS_INTERPOLATEROW_16_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - InterpolateRow = InterpolateRow_Any_16_AVX2; - if (IS_ALIGNED(src_width, 32)) { - InterpolateRow = InterpolateRow_16_AVX2; - } - } -#endif -#if defined(HAS_INTERPOLATEROW_16_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - InterpolateRow = InterpolateRow_Any_16_NEON; - if (IS_ALIGNED(src_width, 16)) { - InterpolateRow = InterpolateRow_16_NEON; - } - } -#endif - -#if defined(HAS_SCALEFILTERCOLS_16_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) { - ScaleFilterCols = ScaleFilterCols_16_SSSE3; - } -#endif - if (y > max_y) { - y = max_y; - } - - for (j = 0; j < dst_height; ++j) { - int yi = y >> 16; - const uint16_t* src = src_ptr + yi * src_stride; - if (filtering == kFilterLinear) { - ScaleFilterCols(dst_ptr, src, dst_width, x, dx); - } else { - int yf = (y >> 8) & 255; - InterpolateRow((uint16_t*)row, src, src_stride, src_width, yf); - ScaleFilterCols(dst_ptr, (uint16_t*)row, dst_width, x, dx); - } - dst_ptr += dst_stride; - y += dy; - if (y > max_y) { - y = max_y; - } - } - free_aligned_buffer_64(row); -} - -// Scale up down with bilinear interpolation. -void ScalePlaneBilinearUp(int src_width, - int src_height, - int dst_width, - int dst_height, - int src_stride, - int dst_stride, - const uint8_t* src_ptr, - uint8_t* dst_ptr, - enum FilterMode filtering) { - int j; - // Initial source x/y coordinate and step values as 16.16 fixed point. - int x = 0; - int y = 0; - int dx = 0; - int dy = 0; - const int max_y = (src_height - 1) << 16; - void (*InterpolateRow)(uint8_t * dst_ptr, const uint8_t* src_ptr, - ptrdiff_t src_stride, int dst_width, - int source_y_fraction) = InterpolateRow_C; - void (*ScaleFilterCols)(uint8_t * dst_ptr, const uint8_t* src_ptr, - int dst_width, int x, int dx) = - filtering ? 
ScaleFilterCols_C : ScaleCols_C; - ScaleSlope(src_width, src_height, dst_width, dst_height, filtering, &x, &y, - &dx, &dy); - src_width = Abs(src_width); - -#if defined(HAS_INTERPOLATEROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - InterpolateRow = InterpolateRow_Any_SSSE3; - if (IS_ALIGNED(dst_width, 16)) { - InterpolateRow = InterpolateRow_SSSE3; - } - } -#endif -#if defined(HAS_INTERPOLATEROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - InterpolateRow = InterpolateRow_Any_AVX2; - if (IS_ALIGNED(dst_width, 32)) { - InterpolateRow = InterpolateRow_AVX2; - } - } -#endif -#if defined(HAS_INTERPOLATEROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - InterpolateRow = InterpolateRow_Any_NEON; - if (IS_ALIGNED(dst_width, 16)) { - InterpolateRow = InterpolateRow_NEON; - } - } -#endif - - if (filtering && src_width >= 32768) { - ScaleFilterCols = ScaleFilterCols64_C; - } -#if defined(HAS_SCALEFILTERCOLS_SSSE3) - if (filtering && TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) { - ScaleFilterCols = ScaleFilterCols_SSSE3; - } -#endif -#if defined(HAS_SCALEFILTERCOLS_NEON) - if (filtering && TestCpuFlag(kCpuHasNEON) && src_width < 32768) { - ScaleFilterCols = ScaleFilterCols_Any_NEON; - if (IS_ALIGNED(dst_width, 8)) { - ScaleFilterCols = ScaleFilterCols_NEON; - } - } -#endif -#if defined(HAS_SCALEFILTERCOLS_MSA) - if (filtering && TestCpuFlag(kCpuHasMSA) && src_width < 32768) { - ScaleFilterCols = ScaleFilterCols_Any_MSA; - if (IS_ALIGNED(dst_width, 16)) { - ScaleFilterCols = ScaleFilterCols_MSA; - } - } -#endif - if (!filtering && src_width * 2 == dst_width && x < 0x8000) { - ScaleFilterCols = ScaleColsUp2_C; -#if defined(HAS_SCALECOLS_SSE2) - if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8)) { - ScaleFilterCols = ScaleColsUp2_SSE2; - } -#endif -#if defined(HAS_SCALECOLS_MMI) - if (TestCpuFlag(kCpuHasMMI) && IS_ALIGNED(dst_width, 8)) { - ScaleFilterCols = ScaleColsUp2_MMI; - } -#endif - } - - if (y > max_y) { - y = max_y; - } - { - int yi = y >> 16; - const uint8_t* src = src_ptr + yi * src_stride; - - // Allocate 2 row buffers. - const int kRowSize = (dst_width + 31) & ~31; - align_buffer_64(row, kRowSize * 2); - - uint8_t* rowptr = row; - int rowstride = kRowSize; - int lasty = yi; - - ScaleFilterCols(rowptr, src, dst_width, x, dx); - if (src_height > 1) { - src += src_stride; - } - ScaleFilterCols(rowptr + rowstride, src, dst_width, x, dx); - src += src_stride; - - for (j = 0; j < dst_height; ++j) { - yi = y >> 16; - if (yi != lasty) { - if (y > max_y) { - y = max_y; - yi = y >> 16; - src = src_ptr + yi * src_stride; - } - if (yi != lasty) { - ScaleFilterCols(rowptr, src, dst_width, x, dx); - rowptr += rowstride; - rowstride = -rowstride; - lasty = yi; - src += src_stride; - } - } - if (filtering == kFilterLinear) { - InterpolateRow(dst_ptr, rowptr, 0, dst_width, 0); - } else { - int yf = (y >> 8) & 255; - InterpolateRow(dst_ptr, rowptr, rowstride, dst_width, yf); - } - dst_ptr += dst_stride; - y += dy; - } - free_aligned_buffer_64(row); - } -} - -// Scale plane, horizontally up by 2 times. -// Uses linear filter horizontally, nearest vertically. -// This is an optimized version for scaling up a plane to 2 times of -// its original width, using linear interpolation. -// This is used to scale U and V planes of I422 to I444. 
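The per-row kernels (ScaleRowUp2_Linear_* and their SIMD variants) live in the row files, not in this file, but conceptually each interior destination pixel is a 3:1 blend of its two nearest source pixels, which is what a 2x linear upsample at quarter-pixel phase works out to. The sketch below is hedged: LinearUp2RowSketch is an invented name, and it assumes the same 3:1-with-rounding convention that the SU2BLANY wrapper later in this diff uses for its edge pixels; the exact upstream kernels may differ at the edges.

    #include <stdint.h>

    /* Sketch of a 2x horizontal linear upsample for one row; assumes
     * dst_width >= 2.  Edge pixels are copied, interior pixels are 3:1
     * blends of the two nearest source pixels with round-to-nearest. */
    static void LinearUp2RowSketch(const uint8_t* src, uint8_t* dst, int dst_width) {
      int src_width = (dst_width + 1) / 2;
      int x;
      dst[0] = src[0];                                /* left edge */
      for (x = 0; x + 1 < src_width; ++x) {
        dst[2 * x + 1] = (uint8_t)((3 * src[x] + src[x + 1] + 2) >> 2);
        dst[2 * x + 2] = (uint8_t)((src[x] + 3 * src[x + 1] + 2) >> 2);
      }
      dst[dst_width - 1] = src[(dst_width / 2) - 1];  /* right edge */
    }

ScalePlaneUp2_Linear below then simply picks one source row per destination row (nearest vertically) and runs such a kernel across it.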
-void ScalePlaneUp2_Linear(int src_width, - int src_height, - int dst_width, - int dst_height, - int src_stride, - int dst_stride, - const uint8_t* src_ptr, - uint8_t* dst_ptr) { - void (*ScaleRowUp)(const uint8_t* src_ptr, uint8_t* dst_ptr, int dst_width) = - ScaleRowUp2_Linear_Any_C; - int i; - int y; - int dy; - - // This function can only scale up by 2 times horizontally. - assert(src_width == ((dst_width + 1) / 2)); - -#ifdef HAS_SCALEROWUP2LINEAR_SSE2 - if (TestCpuFlag(kCpuHasSSE2)) { - ScaleRowUp = ScaleRowUp2_Linear_Any_SSE2; - } -#endif - -#ifdef HAS_SCALEROWUP2LINEAR_SSSE3 - if (TestCpuFlag(kCpuHasSSSE3)) { - ScaleRowUp = ScaleRowUp2_Linear_Any_SSSE3; - } -#endif - -#ifdef HAS_SCALEROWUP2LINEAR_AVX2 - if (TestCpuFlag(kCpuHasAVX2)) { - ScaleRowUp = ScaleRowUp2_Linear_Any_AVX2; - } -#endif - -#ifdef HAS_SCALEROWUP2LINEAR_NEON - if (TestCpuFlag(kCpuHasNEON)) { - ScaleRowUp = ScaleRowUp2_Linear_Any_NEON; - } -#endif - - if (dst_height == 1) { - ScaleRowUp(src_ptr + ((src_height - 1) / 2) * src_stride, dst_ptr, - dst_width); - } else { - dy = FixedDiv(src_height - 1, dst_height - 1); - y = (1 << 15) - 1; - for (i = 0; i < dst_height; ++i) { - ScaleRowUp(src_ptr + (y >> 16) * src_stride, dst_ptr, dst_width); - dst_ptr += dst_stride; - y += dy; - } - } -} - -// Scale plane, up by 2 times. -// This is an optimized version for scaling up a plane to 2 times of -// its original size, using bilinear interpolation. -// This is used to scale U and V planes of I420 to I444. -void ScalePlaneUp2_Bilinear(int src_width, - int src_height, - int dst_width, - int dst_height, - int src_stride, - int dst_stride, - const uint8_t* src_ptr, - uint8_t* dst_ptr) { - void (*Scale2RowUp)(const uint8_t* src_ptr, ptrdiff_t src_stride, - uint8_t* dst_ptr, ptrdiff_t dst_stride, int dst_width) = - ScaleRowUp2_Bilinear_Any_C; - int x; - - // This function can only scale up by 2 times. - assert(src_width == ((dst_width + 1) / 2)); - assert(src_height == ((dst_height + 1) / 2)); - -#ifdef HAS_SCALEROWUP2BILINEAR_SSE2 - if (TestCpuFlag(kCpuHasSSE2)) { - Scale2RowUp = ScaleRowUp2_Bilinear_Any_SSE2; - } -#endif - -#ifdef HAS_SCALEROWUP2BILINEAR_SSSE3 - if (TestCpuFlag(kCpuHasSSSE3)) { - Scale2RowUp = ScaleRowUp2_Bilinear_Any_SSSE3; - } -#endif - -#ifdef HAS_SCALEROWUP2BILINEAR_AVX2 - if (TestCpuFlag(kCpuHasAVX2)) { - Scale2RowUp = ScaleRowUp2_Bilinear_Any_AVX2; - } -#endif - -#ifdef HAS_SCALEROWUP2BILINEAR_NEON - if (TestCpuFlag(kCpuHasNEON)) { - Scale2RowUp = ScaleRowUp2_Bilinear_Any_NEON; - } -#endif - - Scale2RowUp(src_ptr, 0, dst_ptr, 0, dst_width); - dst_ptr += dst_stride; - for (x = 0; x < src_height - 1; ++x) { - Scale2RowUp(src_ptr, src_stride, dst_ptr, dst_stride, dst_width); - src_ptr += src_stride; - // TODO(fbarchard): Test performance of writing one row of destination at a - // time. - dst_ptr += 2 * dst_stride; - } - if (!(dst_height & 1)) { - Scale2RowUp(src_ptr, 0, dst_ptr, 0, dst_width); - } -} - -// Scale at most 14 bit plane, horizontally up by 2 times. -// This is an optimized version for scaling up a plane to 2 times of -// its original width, using linear interpolation. -// stride is in count of uint16_t. -// This is used to scale U and V planes of I210 to I410 and I212 to I412. 
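Reading the loop in ScalePlaneUp2_Bilinear above: the first and last destination rows are produced by calling Scale2RowUp with a source stride of 0, so the vertical blend collapses to the edge row itself, while each interior iteration consumes one pair of source rows and emits two destination rows at 3:1 and 1:3 vertical weights. For src_height = 3 and dst_height = 6 the schedule works out to:

    dst row 0      <- src row 0         (stride 0: edge row replicated)
    dst rows 1, 2  <- src rows 0, 1     (3:1 and 1:3 vertical blends)
    dst rows 3, 4  <- src rows 1, 2     (3:1 and 1:3 vertical blends)
    dst row 5      <- src row 2         (stride 0: edge row replicated)

The 12-bit and 16-bit variants that follow reuse this same schedule; only the row kernels change.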
-void ScalePlaneUp2_12_Linear(int src_width, - int src_height, - int dst_width, - int dst_height, - int src_stride, - int dst_stride, - const uint16_t* src_ptr, - uint16_t* dst_ptr) { - void (*ScaleRowUp)(const uint16_t* src_ptr, uint16_t* dst_ptr, - int dst_width) = ScaleRowUp2_Linear_16_Any_C; - int i; - int y; - int dy; - - // This function can only scale up by 2 times horizontally. - assert(src_width == ((dst_width + 1) / 2)); - -#ifdef HAS_SCALEROWUP2LINEAR_12_SSSE3 - if (TestCpuFlag(kCpuHasSSSE3)) { - ScaleRowUp = ScaleRowUp2_Linear_12_Any_SSSE3; - } -#endif - -#ifdef HAS_SCALEROWUP2LINEAR_12_AVX2 - if (TestCpuFlag(kCpuHasAVX2)) { - ScaleRowUp = ScaleRowUp2_Linear_12_Any_AVX2; - } -#endif - -#ifdef HAS_SCALEROWUP2LINEAR_12_NEON - if (TestCpuFlag(kCpuHasNEON)) { - ScaleRowUp = ScaleRowUp2_Linear_12_Any_NEON; - } -#endif - - if (dst_height == 1) { - ScaleRowUp(src_ptr + ((src_height - 1) / 2) * src_stride, dst_ptr, - dst_width); - } else { - dy = FixedDiv(src_height - 1, dst_height - 1); - y = (1 << 15) - 1; - for (i = 0; i < dst_height; ++i) { - ScaleRowUp(src_ptr + (y >> 16) * src_stride, dst_ptr, dst_width); - dst_ptr += dst_stride; - y += dy; - } - } -} - -// Scale at most 12 bit plane, up by 2 times. -// This is an optimized version for scaling up a plane to 2 times of -// its original size, using bilinear interpolation. -// stride is in count of uint16_t. -// This is used to scale U and V planes of I010 to I410 and I012 to I412. -void ScalePlaneUp2_12_Bilinear(int src_width, - int src_height, - int dst_width, - int dst_height, - int src_stride, - int dst_stride, - const uint16_t* src_ptr, - uint16_t* dst_ptr) { - void (*Scale2RowUp)(const uint16_t* src_ptr, ptrdiff_t src_stride, - uint16_t* dst_ptr, ptrdiff_t dst_stride, int dst_width) = - ScaleRowUp2_Bilinear_16_Any_C; - int x; - - // This function can only scale up by 2 times. - assert(src_width == ((dst_width + 1) / 2)); - assert(src_height == ((dst_height + 1) / 2)); - -#ifdef HAS_SCALEROWUP2BILINEAR_12_SSSE3 - if (TestCpuFlag(kCpuHasSSSE3)) { - Scale2RowUp = ScaleRowUp2_Bilinear_12_Any_SSSE3; - } -#endif - -#ifdef HAS_SCALEROWUP2BILINEAR_12_AVX2 - if (TestCpuFlag(kCpuHasAVX2)) { - Scale2RowUp = ScaleRowUp2_Bilinear_12_Any_AVX2; - } -#endif - -#ifdef HAS_SCALEROWUP2BILINEAR_12_NEON - if (TestCpuFlag(kCpuHasNEON)) { - Scale2RowUp = ScaleRowUp2_Bilinear_12_Any_NEON; - } -#endif - - Scale2RowUp(src_ptr, 0, dst_ptr, 0, dst_width); - dst_ptr += dst_stride; - for (x = 0; x < src_height - 1; ++x) { - Scale2RowUp(src_ptr, src_stride, dst_ptr, dst_stride, dst_width); - src_ptr += src_stride; - dst_ptr += 2 * dst_stride; - } - if (!(dst_height & 1)) { - Scale2RowUp(src_ptr, 0, dst_ptr, 0, dst_width); - } -} - -void ScalePlaneUp2_16_Linear(int src_width, - int src_height, - int dst_width, - int dst_height, - int src_stride, - int dst_stride, - const uint16_t* src_ptr, - uint16_t* dst_ptr) { - void (*ScaleRowUp)(const uint16_t* src_ptr, uint16_t* dst_ptr, - int dst_width) = ScaleRowUp2_Linear_16_Any_C; - int i; - int y; - int dy; - - // This function can only scale up by 2 times horizontally. 
- assert(src_width == ((dst_width + 1) / 2)); - -#ifdef HAS_SCALEROWUP2LINEAR_16_SSE2 - if (TestCpuFlag(kCpuHasSSE2)) { - ScaleRowUp = ScaleRowUp2_Linear_16_Any_SSE2; - } -#endif - -#ifdef HAS_SCALEROWUP2LINEAR_16_AVX2 - if (TestCpuFlag(kCpuHasAVX2)) { - ScaleRowUp = ScaleRowUp2_Linear_16_Any_AVX2; - } -#endif - -#ifdef HAS_SCALEROWUP2LINEAR_16_NEON - if (TestCpuFlag(kCpuHasNEON)) { - ScaleRowUp = ScaleRowUp2_Linear_16_Any_NEON; - } -#endif - - if (dst_height == 1) { - ScaleRowUp(src_ptr + ((src_height - 1) / 2) * src_stride, dst_ptr, - dst_width); - } else { - dy = FixedDiv(src_height - 1, dst_height - 1); - y = (1 << 15) - 1; - for (i = 0; i < dst_height; ++i) { - ScaleRowUp(src_ptr + (y >> 16) * src_stride, dst_ptr, dst_width); - dst_ptr += dst_stride; - y += dy; - } - } -} - -void ScalePlaneUp2_16_Bilinear(int src_width, - int src_height, - int dst_width, - int dst_height, - int src_stride, - int dst_stride, - const uint16_t* src_ptr, - uint16_t* dst_ptr) { - void (*Scale2RowUp)(const uint16_t* src_ptr, ptrdiff_t src_stride, - uint16_t* dst_ptr, ptrdiff_t dst_stride, int dst_width) = - ScaleRowUp2_Bilinear_16_Any_C; - int x; - - // This function can only scale up by 2 times. - assert(src_width == ((dst_width + 1) / 2)); - assert(src_height == ((dst_height + 1) / 2)); - -#ifdef HAS_SCALEROWUP2BILINEAR_16_SSE2 - if (TestCpuFlag(kCpuHasSSSE3)) { - Scale2RowUp = ScaleRowUp2_Bilinear_16_Any_SSSE3; - } -#endif - -#ifdef HAS_SCALEROWUP2BILINEAR_16_AVX2 - if (TestCpuFlag(kCpuHasAVX2)) { - Scale2RowUp = ScaleRowUp2_Bilinear_16_Any_AVX2; - } -#endif - -#ifdef HAS_SCALEROWUP2BILINEAR_16_NEON - if (TestCpuFlag(kCpuHasNEON)) { - Scale2RowUp = ScaleRowUp2_Bilinear_16_Any_NEON; - } -#endif - - Scale2RowUp(src_ptr, 0, dst_ptr, 0, dst_width); - dst_ptr += dst_stride; - for (x = 0; x < src_height - 1; ++x) { - Scale2RowUp(src_ptr, src_stride, dst_ptr, dst_stride, dst_width); - src_ptr += src_stride; - dst_ptr += 2 * dst_stride; - } - if (!(dst_height & 1)) { - Scale2RowUp(src_ptr, 0, dst_ptr, 0, dst_width); - } -} - -void ScalePlaneBilinearUp_16(int src_width, - int src_height, - int dst_width, - int dst_height, - int src_stride, - int dst_stride, - const uint16_t* src_ptr, - uint16_t* dst_ptr, - enum FilterMode filtering) { - int j; - // Initial source x/y coordinate and step values as 16.16 fixed point. - int x = 0; - int y = 0; - int dx = 0; - int dy = 0; - const int max_y = (src_height - 1) << 16; - void (*InterpolateRow)(uint16_t * dst_ptr, const uint16_t* src_ptr, - ptrdiff_t src_stride, int dst_width, - int source_y_fraction) = InterpolateRow_16_C; - void (*ScaleFilterCols)(uint16_t * dst_ptr, const uint16_t* src_ptr, - int dst_width, int x, int dx) = - filtering ? 
ScaleFilterCols_16_C : ScaleCols_16_C; - ScaleSlope(src_width, src_height, dst_width, dst_height, filtering, &x, &y, - &dx, &dy); - src_width = Abs(src_width); - -#if defined(HAS_INTERPOLATEROW_16_SSE2) - if (TestCpuFlag(kCpuHasSSE2)) { - InterpolateRow = InterpolateRow_Any_16_SSE2; - if (IS_ALIGNED(dst_width, 16)) { - InterpolateRow = InterpolateRow_16_SSE2; - } - } -#endif -#if defined(HAS_INTERPOLATEROW_16_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - InterpolateRow = InterpolateRow_Any_16_SSSE3; - if (IS_ALIGNED(dst_width, 16)) { - InterpolateRow = InterpolateRow_16_SSSE3; - } - } -#endif -#if defined(HAS_INTERPOLATEROW_16_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - InterpolateRow = InterpolateRow_Any_16_AVX2; - if (IS_ALIGNED(dst_width, 32)) { - InterpolateRow = InterpolateRow_16_AVX2; - } - } -#endif -#if defined(HAS_INTERPOLATEROW_16_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - InterpolateRow = InterpolateRow_Any_16_NEON; - if (IS_ALIGNED(dst_width, 16)) { - InterpolateRow = InterpolateRow_16_NEON; - } - } -#endif - - if (filtering && src_width >= 32768) { - ScaleFilterCols = ScaleFilterCols64_16_C; - } -#if defined(HAS_SCALEFILTERCOLS_16_SSSE3) - if (filtering && TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) { - ScaleFilterCols = ScaleFilterCols_16_SSSE3; - } -#endif - if (!filtering && src_width * 2 == dst_width && x < 0x8000) { - ScaleFilterCols = ScaleColsUp2_16_C; -#if defined(HAS_SCALECOLS_16_SSE2) - if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8)) { - ScaleFilterCols = ScaleColsUp2_16_SSE2; - } -#endif -#if defined(HAS_SCALECOLS_16_MMI) - if (TestCpuFlag(kCpuHasMMI) && IS_ALIGNED(dst_width, 8)) { - ScaleFilterCols = ScaleColsUp2_16_MMI; - } -#endif - } - - if (y > max_y) { - y = max_y; - } - { - int yi = y >> 16; - const uint16_t* src = src_ptr + yi * src_stride; - - // Allocate 2 row buffers. - const int kRowSize = (dst_width + 31) & ~31; - align_buffer_64(row, kRowSize * 4); - - uint16_t* rowptr = (uint16_t*)row; - int rowstride = kRowSize; - int lasty = yi; - - ScaleFilterCols(rowptr, src, dst_width, x, dx); - if (src_height > 1) { - src += src_stride; - } - ScaleFilterCols(rowptr + rowstride, src, dst_width, x, dx); - src += src_stride; - - for (j = 0; j < dst_height; ++j) { - yi = y >> 16; - if (yi != lasty) { - if (y > max_y) { - y = max_y; - yi = y >> 16; - src = src_ptr + yi * src_stride; - } - if (yi != lasty) { - ScaleFilterCols(rowptr, src, dst_width, x, dx); - rowptr += rowstride; - rowstride = -rowstride; - lasty = yi; - src += src_stride; - } - } - if (filtering == kFilterLinear) { - InterpolateRow(dst_ptr, rowptr, 0, dst_width, 0); - } else { - int yf = (y >> 8) & 255; - InterpolateRow(dst_ptr, rowptr, rowstride, dst_width, yf); - } - dst_ptr += dst_stride; - y += dy; - } - free_aligned_buffer_64(row); - } -} - -// Scale Plane to/from any dimensions, without interpolation. -// Fixed point math is used for performance: The upper 16 bits -// of x and dx is the integer part of the source position and -// the lower 16 bits are the fixed decimal part. - -static void ScalePlaneSimple(int src_width, - int src_height, - int dst_width, - int dst_height, - int src_stride, - int dst_stride, - const uint8_t* src_ptr, - uint8_t* dst_ptr) { - int i; - void (*ScaleCols)(uint8_t * dst_ptr, const uint8_t* src_ptr, int dst_width, - int x, int dx) = ScaleCols_C; - // Initial source x/y coordinate and step values as 16.16 fixed point. 
- int x = 0; - int y = 0; - int dx = 0; - int dy = 0; - ScaleSlope(src_width, src_height, dst_width, dst_height, kFilterNone, &x, &y, - &dx, &dy); - src_width = Abs(src_width); - - if (src_width * 2 == dst_width && x < 0x8000) { - ScaleCols = ScaleColsUp2_C; -#if defined(HAS_SCALECOLS_SSE2) - if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8)) { - ScaleCols = ScaleColsUp2_SSE2; - } -#endif -#if defined(HAS_SCALECOLS_MMI) - if (TestCpuFlag(kCpuHasMMI) && IS_ALIGNED(dst_width, 8)) { - ScaleCols = ScaleColsUp2_MMI; - } -#endif - } - - for (i = 0; i < dst_height; ++i) { - ScaleCols(dst_ptr, src_ptr + (y >> 16) * src_stride, dst_width, x, dx); - dst_ptr += dst_stride; - y += dy; - } -} - -static void ScalePlaneSimple_16(int src_width, - int src_height, - int dst_width, - int dst_height, - int src_stride, - int dst_stride, - const uint16_t* src_ptr, - uint16_t* dst_ptr) { - int i; - void (*ScaleCols)(uint16_t * dst_ptr, const uint16_t* src_ptr, int dst_width, - int x, int dx) = ScaleCols_16_C; - // Initial source x/y coordinate and step values as 16.16 fixed point. - int x = 0; - int y = 0; - int dx = 0; - int dy = 0; - ScaleSlope(src_width, src_height, dst_width, dst_height, kFilterNone, &x, &y, - &dx, &dy); - src_width = Abs(src_width); - - if (src_width * 2 == dst_width && x < 0x8000) { - ScaleCols = ScaleColsUp2_16_C; -#if defined(HAS_SCALECOLS_16_SSE2) - if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8)) { - ScaleCols = ScaleColsUp2_16_SSE2; - } -#endif -#if defined(HAS_SCALECOLS_16_MMI) - if (TestCpuFlag(kCpuHasMMI) && IS_ALIGNED(dst_width, 8)) { - ScaleCols = ScaleColsUp2_16_MMI; - } -#endif - } - - for (i = 0; i < dst_height; ++i) { - ScaleCols(dst_ptr, src_ptr + (y >> 16) * src_stride, dst_width, x, dx); - dst_ptr += dst_stride; - y += dy; - } -} - -// Scale a plane. -// This function dispatches to a specialized scaler based on scale factor. - -LIBYUV_API -void ScalePlane(const uint8_t* src, - int src_stride, - int src_width, - int src_height, - uint8_t* dst, - int dst_stride, - int dst_width, - int dst_height, - enum FilterMode filtering) { - // Simplify filtering when possible. - filtering = ScaleFilterReduce(src_width, src_height, dst_width, dst_height, - filtering); - - // Negative height means invert the image. - if (src_height < 0) { - src_height = -src_height; - src = src + (src_height - 1) * src_stride; - src_stride = -src_stride; - } - - // Use specialized scales to improve performance for common resolutions. - // For example, all the 1/2 scalings will use ScalePlaneDown2() - if (dst_width == src_width && dst_height == src_height) { - // Straight copy. - CopyPlane(src, src_stride, dst, dst_stride, dst_width, dst_height); - return; - } - if (dst_width == src_width && filtering != kFilterBox) { - int dy = FixedDiv(src_height, dst_height); - // Arbitrary scale vertically, but unscaled horizontally. - ScalePlaneVertical(src_height, dst_width, dst_height, src_stride, - dst_stride, src, dst, 0, 0, dy, 1, filtering); - return; - } - if (dst_width <= Abs(src_width) && dst_height <= src_height) { - // Scale down. 
- if (4 * dst_width == 3 * src_width && 4 * dst_height == 3 * src_height) { - // optimized, 3/4 - ScalePlaneDown34(src_width, src_height, dst_width, dst_height, src_stride, - dst_stride, src, dst, filtering); - return; - } - if (2 * dst_width == src_width && 2 * dst_height == src_height) { - // optimized, 1/2 - ScalePlaneDown2(src_width, src_height, dst_width, dst_height, src_stride, - dst_stride, src, dst, filtering); - return; - } - // 3/8 rounded up for odd sized chroma height. - if (8 * dst_width == 3 * src_width && 8 * dst_height == 3 * src_height) { - // optimized, 3/8 - ScalePlaneDown38(src_width, src_height, dst_width, dst_height, src_stride, - dst_stride, src, dst, filtering); - return; - } - if (4 * dst_width == src_width && 4 * dst_height == src_height && - (filtering == kFilterBox || filtering == kFilterNone)) { - // optimized, 1/4 - ScalePlaneDown4(src_width, src_height, dst_width, dst_height, src_stride, - dst_stride, src, dst, filtering); - return; - } - } - if (filtering == kFilterBox && dst_height * 2 < src_height) { - ScalePlaneBox(src_width, src_height, dst_width, dst_height, src_stride, - dst_stride, src, dst); - return; - } - if ((dst_width + 1) / 2 == src_width && filtering == kFilterLinear) { - ScalePlaneUp2_Linear(src_width, src_height, dst_width, dst_height, - src_stride, dst_stride, src, dst); - return; - } - if ((dst_height + 1) / 2 == src_height && (dst_width + 1) / 2 == src_width && - (filtering == kFilterBilinear || filtering == kFilterBox)) { - ScalePlaneUp2_Bilinear(src_width, src_height, dst_width, dst_height, - src_stride, dst_stride, src, dst); - return; - } - if (filtering && dst_height > src_height) { - ScalePlaneBilinearUp(src_width, src_height, dst_width, dst_height, - src_stride, dst_stride, src, dst, filtering); - return; - } - if (filtering) { - ScalePlaneBilinearDown(src_width, src_height, dst_width, dst_height, - src_stride, dst_stride, src, dst, filtering); - return; - } - ScalePlaneSimple(src_width, src_height, dst_width, dst_height, src_stride, - dst_stride, src, dst); -} - -LIBYUV_API -void ScalePlane_16(const uint16_t* src, - int src_stride, - int src_width, - int src_height, - uint16_t* dst, - int dst_stride, - int dst_width, - int dst_height, - enum FilterMode filtering) { - // Simplify filtering when possible. - filtering = ScaleFilterReduce(src_width, src_height, dst_width, dst_height, - filtering); - - // Negative height means invert the image. - if (src_height < 0) { - src_height = -src_height; - src = src + (src_height - 1) * src_stride; - src_stride = -src_stride; - } - - // Use specialized scales to improve performance for common resolutions. - // For example, all the 1/2 scalings will use ScalePlaneDown2() - if (dst_width == src_width && dst_height == src_height) { - // Straight copy. - CopyPlane_16(src, src_stride, dst, dst_stride, dst_width, dst_height); - return; - } - if (dst_width == src_width && filtering != kFilterBox) { - int dy = FixedDiv(src_height, dst_height); - // Arbitrary scale vertically, but unscaled horizontally. - ScalePlaneVertical_16(src_height, dst_width, dst_height, src_stride, - dst_stride, src, dst, 0, 0, dy, 1, filtering); - return; - } - if (dst_width <= Abs(src_width) && dst_height <= src_height) { - // Scale down. 
- if (4 * dst_width == 3 * src_width && 4 * dst_height == 3 * src_height) { - // optimized, 3/4 - ScalePlaneDown34_16(src_width, src_height, dst_width, dst_height, - src_stride, dst_stride, src, dst, filtering); - return; - } - if (2 * dst_width == src_width && 2 * dst_height == src_height) { - // optimized, 1/2 - ScalePlaneDown2_16(src_width, src_height, dst_width, dst_height, - src_stride, dst_stride, src, dst, filtering); - return; - } - // 3/8 rounded up for odd sized chroma height. - if (8 * dst_width == 3 * src_width && 8 * dst_height == 3 * src_height) { - // optimized, 3/8 - ScalePlaneDown38_16(src_width, src_height, dst_width, dst_height, - src_stride, dst_stride, src, dst, filtering); - return; - } - if (4 * dst_width == src_width && 4 * dst_height == src_height && - (filtering == kFilterBox || filtering == kFilterNone)) { - // optimized, 1/4 - ScalePlaneDown4_16(src_width, src_height, dst_width, dst_height, - src_stride, dst_stride, src, dst, filtering); - return; - } - } - if (filtering == kFilterBox && dst_height * 2 < src_height) { - ScalePlaneBox_16(src_width, src_height, dst_width, dst_height, src_stride, - dst_stride, src, dst); - return; - } - if ((dst_width + 1) / 2 == src_width && filtering == kFilterLinear) { - ScalePlaneUp2_16_Linear(src_width, src_height, dst_width, dst_height, - src_stride, dst_stride, src, dst); - return; - } - if ((dst_height + 1) / 2 == src_height && (dst_width + 1) / 2 == src_width && - (filtering == kFilterBilinear || filtering == kFilterBox)) { - ScalePlaneUp2_16_Bilinear(src_width, src_height, dst_width, dst_height, - src_stride, dst_stride, src, dst); - return; - } - if (filtering && dst_height > src_height) { - ScalePlaneBilinearUp_16(src_width, src_height, dst_width, dst_height, - src_stride, dst_stride, src, dst, filtering); - return; - } - if (filtering) { - ScalePlaneBilinearDown_16(src_width, src_height, dst_width, dst_height, - src_stride, dst_stride, src, dst, filtering); - return; - } - ScalePlaneSimple_16(src_width, src_height, dst_width, dst_height, src_stride, - dst_stride, src, dst); -} - -LIBYUV_API -void ScalePlane_12(const uint16_t* src, - int src_stride, - int src_width, - int src_height, - uint16_t* dst, - int dst_stride, - int dst_width, - int dst_height, - enum FilterMode filtering) { - // Simplify filtering when possible. - filtering = ScaleFilterReduce(src_width, src_height, dst_width, dst_height, - filtering); - - // Negative height means invert the image. - if (src_height < 0) { - src_height = -src_height; - src = src + (src_height - 1) * src_stride; - src_stride = -src_stride; - } - - if ((dst_width + 1) / 2 == src_width && filtering == kFilterLinear) { - ScalePlaneUp2_12_Linear(src_width, src_height, dst_width, dst_height, - src_stride, dst_stride, src, dst); - return; - } - if ((dst_height + 1) / 2 == src_height && (dst_width + 1) / 2 == src_width && - (filtering == kFilterBilinear || filtering == kFilterBox)) { - ScalePlaneUp2_12_Bilinear(src_width, src_height, dst_width, dst_height, - src_stride, dst_stride, src, dst); - return; - } - - ScalePlane_16(src, src_stride, src_width, src_height, dst, dst_stride, - dst_width, dst_height, filtering); -} - -// Scale an I420 image. -// This function in turn calls a scaling function for each plane. 
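I420Scale below splits the work per plane: the Y plane is scaled at the requested size and each chroma plane at SUBSAMPLE(dim, 1, 1) = (dim + 1) / 2 of it. A hypothetical call site makes that concrete (buffer allocation is elided and the strides assume tightly packed planes); halving 1920x1080 scales the 960x540 chroma planes down to 480x270.

    /* Hypothetical usage: shrink a tightly packed 1920x1080 I420 frame to
     * 960x540.  Returns 0 on success, -1 on a NULL plane, a non-positive
     * destination size, or a source dimension over 32768. */
    int rv = I420Scale(src_y, 1920, src_u, 960, src_v, 960,
                       1920, 1080,
                       dst_y, 960, dst_u, 480, dst_v, 480,
                       960, 540, kFilterBox);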
- -LIBYUV_API -int I420Scale(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - int src_width, - int src_height, - uint8_t* dst_y, - int dst_stride_y, - uint8_t* dst_u, - int dst_stride_u, - uint8_t* dst_v, - int dst_stride_v, - int dst_width, - int dst_height, - enum FilterMode filtering) { - int src_halfwidth = SUBSAMPLE(src_width, 1, 1); - int src_halfheight = SUBSAMPLE(src_height, 1, 1); - int dst_halfwidth = SUBSAMPLE(dst_width, 1, 1); - int dst_halfheight = SUBSAMPLE(dst_height, 1, 1); - if (!src_y || !src_u || !src_v || src_width <= 0 || src_height == 0 || - src_width > 32768 || src_height > 32768 || !dst_y || !dst_u || !dst_v || - dst_width <= 0 || dst_height <= 0) { - return -1; - } - - ScalePlane(src_y, src_stride_y, src_width, src_height, dst_y, dst_stride_y, - dst_width, dst_height, filtering); - ScalePlane(src_u, src_stride_u, src_halfwidth, src_halfheight, dst_u, - dst_stride_u, dst_halfwidth, dst_halfheight, filtering); - ScalePlane(src_v, src_stride_v, src_halfwidth, src_halfheight, dst_v, - dst_stride_v, dst_halfwidth, dst_halfheight, filtering); - return 0; -} - -LIBYUV_API -int I420Scale_16(const uint16_t* src_y, - int src_stride_y, - const uint16_t* src_u, - int src_stride_u, - const uint16_t* src_v, - int src_stride_v, - int src_width, - int src_height, - uint16_t* dst_y, - int dst_stride_y, - uint16_t* dst_u, - int dst_stride_u, - uint16_t* dst_v, - int dst_stride_v, - int dst_width, - int dst_height, - enum FilterMode filtering) { - int src_halfwidth = SUBSAMPLE(src_width, 1, 1); - int src_halfheight = SUBSAMPLE(src_height, 1, 1); - int dst_halfwidth = SUBSAMPLE(dst_width, 1, 1); - int dst_halfheight = SUBSAMPLE(dst_height, 1, 1); - if (!src_y || !src_u || !src_v || src_width <= 0 || src_height == 0 || - src_width > 32768 || src_height > 32768 || !dst_y || !dst_u || !dst_v || - dst_width <= 0 || dst_height <= 0) { - return -1; - } - - ScalePlane_16(src_y, src_stride_y, src_width, src_height, dst_y, dst_stride_y, - dst_width, dst_height, filtering); - ScalePlane_16(src_u, src_stride_u, src_halfwidth, src_halfheight, dst_u, - dst_stride_u, dst_halfwidth, dst_halfheight, filtering); - ScalePlane_16(src_v, src_stride_v, src_halfwidth, src_halfheight, dst_v, - dst_stride_v, dst_halfwidth, dst_halfheight, filtering); - return 0; -} - -LIBYUV_API -int I420Scale_12(const uint16_t* src_y, - int src_stride_y, - const uint16_t* src_u, - int src_stride_u, - const uint16_t* src_v, - int src_stride_v, - int src_width, - int src_height, - uint16_t* dst_y, - int dst_stride_y, - uint16_t* dst_u, - int dst_stride_u, - uint16_t* dst_v, - int dst_stride_v, - int dst_width, - int dst_height, - enum FilterMode filtering) { - int src_halfwidth = SUBSAMPLE(src_width, 1, 1); - int src_halfheight = SUBSAMPLE(src_height, 1, 1); - int dst_halfwidth = SUBSAMPLE(dst_width, 1, 1); - int dst_halfheight = SUBSAMPLE(dst_height, 1, 1); - if (!src_y || !src_u || !src_v || src_width <= 0 || src_height == 0 || - src_width > 32768 || src_height > 32768 || !dst_y || !dst_u || !dst_v || - dst_width <= 0 || dst_height <= 0) { - return -1; - } - - ScalePlane_12(src_y, src_stride_y, src_width, src_height, dst_y, dst_stride_y, - dst_width, dst_height, filtering); - ScalePlane_12(src_u, src_stride_u, src_halfwidth, src_halfheight, dst_u, - dst_stride_u, dst_halfwidth, dst_halfheight, filtering); - ScalePlane_12(src_v, src_stride_v, src_halfwidth, src_halfheight, dst_v, - dst_stride_v, dst_halfwidth, dst_halfheight, filtering); - 
return 0; -} - -// Scale an I444 image. -// This function in turn calls a scaling function for each plane. - -LIBYUV_API -int I444Scale(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - int src_width, - int src_height, - uint8_t* dst_y, - int dst_stride_y, - uint8_t* dst_u, - int dst_stride_u, - uint8_t* dst_v, - int dst_stride_v, - int dst_width, - int dst_height, - enum FilterMode filtering) { - if (!src_y || !src_u || !src_v || src_width <= 0 || src_height == 0 || - src_width > 32768 || src_height > 32768 || !dst_y || !dst_u || !dst_v || - dst_width <= 0 || dst_height <= 0) { - return -1; - } - - ScalePlane(src_y, src_stride_y, src_width, src_height, dst_y, dst_stride_y, - dst_width, dst_height, filtering); - ScalePlane(src_u, src_stride_u, src_width, src_height, dst_u, dst_stride_u, - dst_width, dst_height, filtering); - ScalePlane(src_v, src_stride_v, src_width, src_height, dst_v, dst_stride_v, - dst_width, dst_height, filtering); - return 0; -} - -LIBYUV_API -int I444Scale_16(const uint16_t* src_y, - int src_stride_y, - const uint16_t* src_u, - int src_stride_u, - const uint16_t* src_v, - int src_stride_v, - int src_width, - int src_height, - uint16_t* dst_y, - int dst_stride_y, - uint16_t* dst_u, - int dst_stride_u, - uint16_t* dst_v, - int dst_stride_v, - int dst_width, - int dst_height, - enum FilterMode filtering) { - if (!src_y || !src_u || !src_v || src_width <= 0 || src_height == 0 || - src_width > 32768 || src_height > 32768 || !dst_y || !dst_u || !dst_v || - dst_width <= 0 || dst_height <= 0) { - return -1; - } - - ScalePlane_16(src_y, src_stride_y, src_width, src_height, dst_y, dst_stride_y, - dst_width, dst_height, filtering); - ScalePlane_16(src_u, src_stride_u, src_width, src_height, dst_u, dst_stride_u, - dst_width, dst_height, filtering); - ScalePlane_16(src_v, src_stride_v, src_width, src_height, dst_v, dst_stride_v, - dst_width, dst_height, filtering); - return 0; -} - -LIBYUV_API -int I444Scale_12(const uint16_t* src_y, - int src_stride_y, - const uint16_t* src_u, - int src_stride_u, - const uint16_t* src_v, - int src_stride_v, - int src_width, - int src_height, - uint16_t* dst_y, - int dst_stride_y, - uint16_t* dst_u, - int dst_stride_u, - uint16_t* dst_v, - int dst_stride_v, - int dst_width, - int dst_height, - enum FilterMode filtering) { - if (!src_y || !src_u || !src_v || src_width <= 0 || src_height == 0 || - src_width > 32768 || src_height > 32768 || !dst_y || !dst_u || !dst_v || - dst_width <= 0 || dst_height <= 0) { - return -1; - } - - ScalePlane_12(src_y, src_stride_y, src_width, src_height, dst_y, dst_stride_y, - dst_width, dst_height, filtering); - ScalePlane_12(src_u, src_stride_u, src_width, src_height, dst_u, dst_stride_u, - dst_width, dst_height, filtering); - ScalePlane_12(src_v, src_stride_v, src_width, src_height, dst_v, dst_stride_v, - dst_width, dst_height, filtering); - return 0; -} - -// Scale an NV12 image. -// This function in turn calls a scaling function for each plane. 
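NV12Scale below does the same split for the bi-planar layout: Y is scaled directly and the single interleaved UV plane goes through UVScale at half dimensions, where each U,V byte pair counts as one pixel. Because a UV pixel is two bytes, the UV stride in bytes of a tightly packed NV12 frame equals the Y stride. A hypothetical call (buffers elided, tightly packed strides assumed):

    /* Hypothetical usage: 1280x720 NV12 down to 640x360.  The UV plane is
     * 640x360 UV pixels (2 bytes each) on input and 320x180 on output. */
    int rv = NV12Scale(src_y, 1280, src_uv, 1280,
                       1280, 720,
                       dst_y, 640, dst_uv, 640,
                       640, 360, kFilterBilinear);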
- -LIBYUV_API -int NV12Scale(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_uv, - int src_stride_uv, - int src_width, - int src_height, - uint8_t* dst_y, - int dst_stride_y, - uint8_t* dst_uv, - int dst_stride_uv, - int dst_width, - int dst_height, - enum FilterMode filtering) { - int src_halfwidth = SUBSAMPLE(src_width, 1, 1); - int src_halfheight = SUBSAMPLE(src_height, 1, 1); - int dst_halfwidth = SUBSAMPLE(dst_width, 1, 1); - int dst_halfheight = SUBSAMPLE(dst_height, 1, 1); - if (!src_y || !src_uv || src_width <= 0 || src_height == 0 || - src_width > 32768 || src_height > 32768 || !dst_y || !dst_uv || - dst_width <= 0 || dst_height <= 0) { - return -1; - } - - ScalePlane(src_y, src_stride_y, src_width, src_height, dst_y, dst_stride_y, - dst_width, dst_height, filtering); - UVScale(src_uv, src_stride_uv, src_halfwidth, src_halfheight, dst_uv, - dst_stride_uv, dst_halfwidth, dst_halfheight, filtering); - return 0; -} - -// Deprecated api -LIBYUV_API -int Scale(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - int src_stride_y, - int src_stride_u, - int src_stride_v, - int src_width, - int src_height, - uint8_t* dst_y, - uint8_t* dst_u, - uint8_t* dst_v, - int dst_stride_y, - int dst_stride_u, - int dst_stride_v, - int dst_width, - int dst_height, - LIBYUV_BOOL interpolate) { - return I420Scale(src_y, src_stride_y, src_u, src_stride_u, src_v, - src_stride_v, src_width, src_height, dst_y, dst_stride_y, - dst_u, dst_stride_u, dst_v, dst_stride_v, dst_width, - dst_height, interpolate ? kFilterBox : kFilterNone); -} - -#ifdef __cplusplus -} // extern "C" -} // namespace libyuv -#endif diff --git a/thirdparty/libyuv/source/scale_any.cc b/thirdparty/libyuv/source/scale_any.cc deleted file mode 100644 index 965749c..0000000 --- a/thirdparty/libyuv/source/scale_any.cc +++ /dev/null @@ -1,1026 +0,0 @@ -/* - * Copyright 2015 The LibYuv Project Authors. All rights reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#include // For memset/memcpy - -#include "libyuv/scale.h" -#include "libyuv/scale_row.h" - -#include "libyuv/basic_types.h" - -#ifdef __cplusplus -namespace libyuv { -extern "C" { -#endif - -// Fixed scale down. -// Mask may be non-power of 2, so use MOD -#define SDANY(NAMEANY, SCALEROWDOWN_SIMD, SCALEROWDOWN_C, FACTOR, BPP, MASK) \ - void NAMEANY(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst_ptr, \ - int dst_width) { \ - int r = (int)((unsigned int)dst_width % (MASK + 1)); /* NOLINT */ \ - int n = dst_width - r; \ - if (n > 0) { \ - SCALEROWDOWN_SIMD(src_ptr, src_stride, dst_ptr, n); \ - } \ - SCALEROWDOWN_C(src_ptr + (n * FACTOR) * BPP, src_stride, \ - dst_ptr + n * BPP, r); \ - } - -// Fixed scale down for odd source width. Used by I420Blend subsampling. -// Since dst_width is (width + 1) / 2, this function scales one less pixel -// and copies the last pixel. 
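The SDANY pattern is easiest to see expanded. Taking the NEON 1/2 box instance as an example, the wrapper hands the largest multiple of (MASK + 1) destination pixels to the SIMD kernel and lets the C kernel finish the remainder, offset into the source by FACTOR * BPP bytes per destination pixel already produced. Roughly (mechanical expansion of the macro above, shown here only for illustration):

    /* Rough expansion of SDANY(ScaleRowDown2Box_Any_NEON, ScaleRowDown2Box_NEON,
     *                          ScaleRowDown2Box_C, 2, 1, 15): */
    void ScaleRowDown2Box_Any_NEON(const uint8_t* src_ptr, ptrdiff_t src_stride,
                                   uint8_t* dst_ptr, int dst_width) {
      int r = (int)((unsigned int)dst_width % 16);  /* remainder for the C kernel */
      int n = dst_width - r;                        /* multiple of 16 for NEON   */
      if (n > 0) {
        ScaleRowDown2Box_NEON(src_ptr, src_stride, dst_ptr, n);
      }
      /* FACTOR = 2, BPP = 1: each output pixel consumed 2 source bytes. */
      ScaleRowDown2Box_C(src_ptr + n * 2, src_stride, dst_ptr + n, r);
    }

SDODD, defined next, is the same wrapper shifted by one: it scales dst_width - 1 pixels this way and handles the final pixel separately, which is what the odd-width I420Blend subsampling path needs.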
-#define SDODD(NAMEANY, SCALEROWDOWN_SIMD, SCALEROWDOWN_C, FACTOR, BPP, MASK) \ - void NAMEANY(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst_ptr, \ - int dst_width) { \ - int r = (int)((unsigned int)(dst_width - 1) % (MASK + 1)); /* NOLINT */ \ - int n = (dst_width - 1) - r; \ - if (n > 0) { \ - SCALEROWDOWN_SIMD(src_ptr, src_stride, dst_ptr, n); \ - } \ - SCALEROWDOWN_C(src_ptr + (n * FACTOR) * BPP, src_stride, \ - dst_ptr + n * BPP, r + 1); \ - } - -#ifdef HAS_SCALEROWDOWN2_SSSE3 -SDANY(ScaleRowDown2_Any_SSSE3, ScaleRowDown2_SSSE3, ScaleRowDown2_C, 2, 1, 15) -SDANY(ScaleRowDown2Linear_Any_SSSE3, - ScaleRowDown2Linear_SSSE3, - ScaleRowDown2Linear_C, - 2, - 1, - 15) -SDANY(ScaleRowDown2Box_Any_SSSE3, - ScaleRowDown2Box_SSSE3, - ScaleRowDown2Box_C, - 2, - 1, - 15) -SDODD(ScaleRowDown2Box_Odd_SSSE3, - ScaleRowDown2Box_SSSE3, - ScaleRowDown2Box_Odd_C, - 2, - 1, - 15) -#endif -#ifdef HAS_SCALEUVROWDOWN2BOX_SSSE3 -SDANY(ScaleUVRowDown2Box_Any_SSSE3, - ScaleUVRowDown2Box_SSSE3, - ScaleUVRowDown2Box_C, - 2, - 2, - 4) -#endif -#ifdef HAS_SCALEUVROWDOWN2BOX_AVX2 -SDANY(ScaleUVRowDown2Box_Any_AVX2, - ScaleUVRowDown2Box_AVX2, - ScaleUVRowDown2Box_C, - 2, - 2, - 8) -#endif -#ifdef HAS_SCALEROWDOWN2_AVX2 -SDANY(ScaleRowDown2_Any_AVX2, ScaleRowDown2_AVX2, ScaleRowDown2_C, 2, 1, 31) -SDANY(ScaleRowDown2Linear_Any_AVX2, - ScaleRowDown2Linear_AVX2, - ScaleRowDown2Linear_C, - 2, - 1, - 31) -SDANY(ScaleRowDown2Box_Any_AVX2, - ScaleRowDown2Box_AVX2, - ScaleRowDown2Box_C, - 2, - 1, - 31) -SDODD(ScaleRowDown2Box_Odd_AVX2, - ScaleRowDown2Box_AVX2, - ScaleRowDown2Box_Odd_C, - 2, - 1, - 31) -#endif -#ifdef HAS_SCALEROWDOWN2_NEON -SDANY(ScaleRowDown2_Any_NEON, ScaleRowDown2_NEON, ScaleRowDown2_C, 2, 1, 15) -SDANY(ScaleRowDown2Linear_Any_NEON, - ScaleRowDown2Linear_NEON, - ScaleRowDown2Linear_C, - 2, - 1, - 15) -SDANY(ScaleRowDown2Box_Any_NEON, - ScaleRowDown2Box_NEON, - ScaleRowDown2Box_C, - 2, - 1, - 15) -SDODD(ScaleRowDown2Box_Odd_NEON, - ScaleRowDown2Box_NEON, - ScaleRowDown2Box_Odd_C, - 2, - 1, - 15) -#endif -#ifdef HAS_SCALEUVROWDOWN2BOX_NEON -SDANY(ScaleUVRowDown2Box_Any_NEON, - ScaleUVRowDown2Box_NEON, - ScaleUVRowDown2Box_C, - 2, - 2, - 8) -#endif - -#ifdef HAS_SCALEROWDOWN2_MSA -SDANY(ScaleRowDown2_Any_MSA, ScaleRowDown2_MSA, ScaleRowDown2_C, 2, 1, 31) -SDANY(ScaleRowDown2Linear_Any_MSA, - ScaleRowDown2Linear_MSA, - ScaleRowDown2Linear_C, - 2, - 1, - 31) -SDANY(ScaleRowDown2Box_Any_MSA, - ScaleRowDown2Box_MSA, - ScaleRowDown2Box_C, - 2, - 1, - 31) -#endif -#ifdef HAS_SCALEROWDOWN2_MMI -SDANY(ScaleRowDown2_Any_MMI, ScaleRowDown2_MMI, ScaleRowDown2_C, 2, 1, 7) -SDANY(ScaleRowDown2Linear_Any_MMI, - ScaleRowDown2Linear_MMI, - ScaleRowDown2Linear_C, - 2, - 1, - 7) -SDANY(ScaleRowDown2Box_Any_MMI, - ScaleRowDown2Box_MMI, - ScaleRowDown2Box_C, - 2, - 1, - 7) -SDODD(ScaleRowDown2Box_Odd_MMI, - ScaleRowDown2Box_MMI, - ScaleRowDown2Box_Odd_C, - 2, - 1, - 7) -#endif -#ifdef HAS_SCALEROWDOWN4_SSSE3 -SDANY(ScaleRowDown4_Any_SSSE3, ScaleRowDown4_SSSE3, ScaleRowDown4_C, 4, 1, 7) -SDANY(ScaleRowDown4Box_Any_SSSE3, - ScaleRowDown4Box_SSSE3, - ScaleRowDown4Box_C, - 4, - 1, - 7) -#endif -#ifdef HAS_SCALEROWDOWN4_AVX2 -SDANY(ScaleRowDown4_Any_AVX2, ScaleRowDown4_AVX2, ScaleRowDown4_C, 4, 1, 15) -SDANY(ScaleRowDown4Box_Any_AVX2, - ScaleRowDown4Box_AVX2, - ScaleRowDown4Box_C, - 4, - 1, - 15) -#endif -#ifdef HAS_SCALEROWDOWN4_NEON -SDANY(ScaleRowDown4_Any_NEON, ScaleRowDown4_NEON, ScaleRowDown4_C, 4, 1, 7) -SDANY(ScaleRowDown4Box_Any_NEON, - ScaleRowDown4Box_NEON, - ScaleRowDown4Box_C, - 4, - 1, - 7) -#endif -#ifdef 
HAS_SCALEROWDOWN4_MSA -SDANY(ScaleRowDown4_Any_MSA, ScaleRowDown4_MSA, ScaleRowDown4_C, 4, 1, 15) -SDANY(ScaleRowDown4Box_Any_MSA, - ScaleRowDown4Box_MSA, - ScaleRowDown4Box_C, - 4, - 1, - 15) -#endif -#ifdef HAS_SCALEROWDOWN4_MMI -SDANY(ScaleRowDown4_Any_MMI, ScaleRowDown4_MMI, ScaleRowDown4_C, 4, 1, 7) -SDANY(ScaleRowDown4Box_Any_MMI, - ScaleRowDown4Box_MMI, - ScaleRowDown4Box_C, - 4, - 1, - 7) -#endif -#ifdef HAS_SCALEROWDOWN34_SSSE3 -SDANY(ScaleRowDown34_Any_SSSE3, - ScaleRowDown34_SSSE3, - ScaleRowDown34_C, - 4 / 3, - 1, - 23) -SDANY(ScaleRowDown34_0_Box_Any_SSSE3, - ScaleRowDown34_0_Box_SSSE3, - ScaleRowDown34_0_Box_C, - 4 / 3, - 1, - 23) -SDANY(ScaleRowDown34_1_Box_Any_SSSE3, - ScaleRowDown34_1_Box_SSSE3, - ScaleRowDown34_1_Box_C, - 4 / 3, - 1, - 23) -#endif -#ifdef HAS_SCALEROWDOWN34_NEON -SDANY(ScaleRowDown34_Any_NEON, - ScaleRowDown34_NEON, - ScaleRowDown34_C, - 4 / 3, - 1, - 23) -SDANY(ScaleRowDown34_0_Box_Any_NEON, - ScaleRowDown34_0_Box_NEON, - ScaleRowDown34_0_Box_C, - 4 / 3, - 1, - 23) -SDANY(ScaleRowDown34_1_Box_Any_NEON, - ScaleRowDown34_1_Box_NEON, - ScaleRowDown34_1_Box_C, - 4 / 3, - 1, - 23) -#endif -#ifdef HAS_SCALEROWDOWN34_MSA -SDANY(ScaleRowDown34_Any_MSA, - ScaleRowDown34_MSA, - ScaleRowDown34_C, - 4 / 3, - 1, - 47) -SDANY(ScaleRowDown34_0_Box_Any_MSA, - ScaleRowDown34_0_Box_MSA, - ScaleRowDown34_0_Box_C, - 4 / 3, - 1, - 47) -SDANY(ScaleRowDown34_1_Box_Any_MSA, - ScaleRowDown34_1_Box_MSA, - ScaleRowDown34_1_Box_C, - 4 / 3, - 1, - 47) -#endif -#ifdef HAS_SCALEROWDOWN34_MMI -SDANY(ScaleRowDown34_Any_MMI, - ScaleRowDown34_MMI, - ScaleRowDown34_C, - 4 / 3, - 1, - 23) -#endif -#ifdef HAS_SCALEROWDOWN38_SSSE3 -SDANY(ScaleRowDown38_Any_SSSE3, - ScaleRowDown38_SSSE3, - ScaleRowDown38_C, - 8 / 3, - 1, - 11) -SDANY(ScaleRowDown38_3_Box_Any_SSSE3, - ScaleRowDown38_3_Box_SSSE3, - ScaleRowDown38_3_Box_C, - 8 / 3, - 1, - 5) -SDANY(ScaleRowDown38_2_Box_Any_SSSE3, - ScaleRowDown38_2_Box_SSSE3, - ScaleRowDown38_2_Box_C, - 8 / 3, - 1, - 5) -#endif -#ifdef HAS_SCALEROWDOWN38_NEON -SDANY(ScaleRowDown38_Any_NEON, - ScaleRowDown38_NEON, - ScaleRowDown38_C, - 8 / 3, - 1, - 11) -SDANY(ScaleRowDown38_3_Box_Any_NEON, - ScaleRowDown38_3_Box_NEON, - ScaleRowDown38_3_Box_C, - 8 / 3, - 1, - 11) -SDANY(ScaleRowDown38_2_Box_Any_NEON, - ScaleRowDown38_2_Box_NEON, - ScaleRowDown38_2_Box_C, - 8 / 3, - 1, - 11) -#endif -#ifdef HAS_SCALEROWDOWN38_MSA -SDANY(ScaleRowDown38_Any_MSA, - ScaleRowDown38_MSA, - ScaleRowDown38_C, - 8 / 3, - 1, - 11) -SDANY(ScaleRowDown38_3_Box_Any_MSA, - ScaleRowDown38_3_Box_MSA, - ScaleRowDown38_3_Box_C, - 8 / 3, - 1, - 11) -SDANY(ScaleRowDown38_2_Box_Any_MSA, - ScaleRowDown38_2_Box_MSA, - ScaleRowDown38_2_Box_C, - 8 / 3, - 1, - 11) -#endif - -#ifdef HAS_SCALEARGBROWDOWN2_SSE2 -SDANY(ScaleARGBRowDown2_Any_SSE2, - ScaleARGBRowDown2_SSE2, - ScaleARGBRowDown2_C, - 2, - 4, - 3) -SDANY(ScaleARGBRowDown2Linear_Any_SSE2, - ScaleARGBRowDown2Linear_SSE2, - ScaleARGBRowDown2Linear_C, - 2, - 4, - 3) -SDANY(ScaleARGBRowDown2Box_Any_SSE2, - ScaleARGBRowDown2Box_SSE2, - ScaleARGBRowDown2Box_C, - 2, - 4, - 3) -#endif -#ifdef HAS_SCALEARGBROWDOWN2_NEON -SDANY(ScaleARGBRowDown2_Any_NEON, - ScaleARGBRowDown2_NEON, - ScaleARGBRowDown2_C, - 2, - 4, - 7) -SDANY(ScaleARGBRowDown2Linear_Any_NEON, - ScaleARGBRowDown2Linear_NEON, - ScaleARGBRowDown2Linear_C, - 2, - 4, - 7) -SDANY(ScaleARGBRowDown2Box_Any_NEON, - ScaleARGBRowDown2Box_NEON, - ScaleARGBRowDown2Box_C, - 2, - 4, - 7) -#endif -#ifdef HAS_SCALEARGBROWDOWN2_MSA -SDANY(ScaleARGBRowDown2_Any_MSA, - ScaleARGBRowDown2_MSA, - 
ScaleARGBRowDown2_C, - 2, - 4, - 3) -SDANY(ScaleARGBRowDown2Linear_Any_MSA, - ScaleARGBRowDown2Linear_MSA, - ScaleARGBRowDown2Linear_C, - 2, - 4, - 3) -SDANY(ScaleARGBRowDown2Box_Any_MSA, - ScaleARGBRowDown2Box_MSA, - ScaleARGBRowDown2Box_C, - 2, - 4, - 3) -#endif -#ifdef HAS_SCALEARGBROWDOWN2_MMI -SDANY(ScaleARGBRowDown2_Any_MMI, - ScaleARGBRowDown2_MMI, - ScaleARGBRowDown2_C, - 2, - 4, - 1) -SDANY(ScaleARGBRowDown2Linear_Any_MMI, - ScaleARGBRowDown2Linear_MMI, - ScaleARGBRowDown2Linear_C, - 2, - 4, - 1) -SDANY(ScaleARGBRowDown2Box_Any_MMI, - ScaleARGBRowDown2Box_MMI, - ScaleARGBRowDown2Box_C, - 2, - 4, - 1) -#endif -#undef SDANY - -// Scale down by even scale factor. -#define SDAANY(NAMEANY, SCALEROWDOWN_SIMD, SCALEROWDOWN_C, BPP, MASK) \ - void NAMEANY(const uint8_t* src_ptr, ptrdiff_t src_stride, int src_stepx, \ - uint8_t* dst_ptr, int dst_width) { \ - int r = dst_width & MASK; \ - int n = dst_width & ~MASK; \ - if (n > 0) { \ - SCALEROWDOWN_SIMD(src_ptr, src_stride, src_stepx, dst_ptr, n); \ - } \ - SCALEROWDOWN_C(src_ptr + (n * src_stepx) * BPP, src_stride, src_stepx, \ - dst_ptr + n * BPP, r); \ - } - -#ifdef HAS_SCALEARGBROWDOWNEVEN_SSE2 -SDAANY(ScaleARGBRowDownEven_Any_SSE2, - ScaleARGBRowDownEven_SSE2, - ScaleARGBRowDownEven_C, - 4, - 3) -SDAANY(ScaleARGBRowDownEvenBox_Any_SSE2, - ScaleARGBRowDownEvenBox_SSE2, - ScaleARGBRowDownEvenBox_C, - 4, - 3) -#endif -#ifdef HAS_SCALEARGBROWDOWNEVEN_NEON -SDAANY(ScaleARGBRowDownEven_Any_NEON, - ScaleARGBRowDownEven_NEON, - ScaleARGBRowDownEven_C, - 4, - 3) -SDAANY(ScaleARGBRowDownEvenBox_Any_NEON, - ScaleARGBRowDownEvenBox_NEON, - ScaleARGBRowDownEvenBox_C, - 4, - 3) -#endif -#ifdef HAS_SCALEARGBROWDOWNEVEN_MSA -SDAANY(ScaleARGBRowDownEven_Any_MSA, - ScaleARGBRowDownEven_MSA, - ScaleARGBRowDownEven_C, - 4, - 3) -SDAANY(ScaleARGBRowDownEvenBox_Any_MSA, - ScaleARGBRowDownEvenBox_MSA, - ScaleARGBRowDownEvenBox_C, - 4, - 3) -#endif -#ifdef HAS_SCALEARGBROWDOWNEVEN_MMI -SDAANY(ScaleARGBRowDownEven_Any_MMI, - ScaleARGBRowDownEven_MMI, - ScaleARGBRowDownEven_C, - 4, - 1) -SDAANY(ScaleARGBRowDownEvenBox_Any_MMI, - ScaleARGBRowDownEvenBox_MMI, - ScaleARGBRowDownEvenBox_C, - 4, - 1) -#endif -#ifdef HAS_SCALEUVROWDOWNEVEN_NEON -SDAANY(ScaleUVRowDownEven_Any_NEON, - ScaleUVRowDownEven_NEON, - ScaleUVRowDownEven_C, - 2, - 3) -#endif - -#ifdef SASIMDONLY -// This also works and uses memcpy and SIMD instead of C, but is slower on ARM - -// Add rows box filter scale down. 
Using macro from row_any -#define SAROW(NAMEANY, ANY_SIMD, SBPP, BPP, MASK) \ - void NAMEANY(const uint8_t* src_ptr, uint16_t* dst_ptr, int width) { \ - SIMD_ALIGNED(uint16_t dst_temp[32]); \ - SIMD_ALIGNED(uint8_t src_temp[32]); \ - memset(dst_temp, 0, 32 * 2); /* for msan */ \ - int r = width & MASK; \ - int n = width & ~MASK; \ - if (n > 0) { \ - ANY_SIMD(src_ptr, dst_ptr, n); \ - } \ - memcpy(src_temp, src_ptr + n * SBPP, r * SBPP); \ - memcpy(dst_temp, dst_ptr + n * BPP, r * BPP); \ - ANY_SIMD(src_temp, dst_temp, MASK + 1); \ - memcpy(dst_ptr + n * BPP, dst_temp, r * BPP); \ - } - -#ifdef HAS_SCALEADDROW_SSE2 -SAROW(ScaleAddRow_Any_SSE2, ScaleAddRow_SSE2, 1, 2, 15) -#endif -#ifdef HAS_SCALEADDROW_AVX2 -SAROW(ScaleAddRow_Any_AVX2, ScaleAddRow_AVX2, 1, 2, 31) -#endif -#ifdef HAS_SCALEADDROW_NEON -SAROW(ScaleAddRow_Any_NEON, ScaleAddRow_NEON, 1, 2, 15) -#endif -#ifdef HAS_SCALEADDROW_MSA -SAROW(ScaleAddRow_Any_MSA, ScaleAddRow_MSA, 1, 2, 15) -#endif -#ifdef HAS_SCALEADDROW_MMI -SAROW(ScaleAddRow_Any_MMI, ScaleAddRow_MMI, 1, 2, 7) -#endif -#undef SAANY - -#else - -// Add rows box filter scale down. -#define SAANY(NAMEANY, SCALEADDROW_SIMD, SCALEADDROW_C, MASK) \ - void NAMEANY(const uint8_t* src_ptr, uint16_t* dst_ptr, int src_width) { \ - int n = src_width & ~MASK; \ - if (n > 0) { \ - SCALEADDROW_SIMD(src_ptr, dst_ptr, n); \ - } \ - SCALEADDROW_C(src_ptr + n, dst_ptr + n, src_width & MASK); \ - } - -#ifdef HAS_SCALEADDROW_SSE2 -SAANY(ScaleAddRow_Any_SSE2, ScaleAddRow_SSE2, ScaleAddRow_C, 15) -#endif -#ifdef HAS_SCALEADDROW_AVX2 -SAANY(ScaleAddRow_Any_AVX2, ScaleAddRow_AVX2, ScaleAddRow_C, 31) -#endif -#ifdef HAS_SCALEADDROW_NEON -SAANY(ScaleAddRow_Any_NEON, ScaleAddRow_NEON, ScaleAddRow_C, 15) -#endif -#ifdef HAS_SCALEADDROW_MSA -SAANY(ScaleAddRow_Any_MSA, ScaleAddRow_MSA, ScaleAddRow_C, 15) -#endif -#ifdef HAS_SCALEADDROW_MMI -SAANY(ScaleAddRow_Any_MMI, ScaleAddRow_MMI, ScaleAddRow_C, 7) -#endif -#undef SAANY - -#endif // SASIMDONLY - -// Definition for ScaleFilterCols, ScaleARGBCols and ScaleARGBFilterCols -#define CANY(NAMEANY, TERP_SIMD, TERP_C, BPP, MASK) \ - void NAMEANY(uint8_t* dst_ptr, const uint8_t* src_ptr, int dst_width, int x, \ - int dx) { \ - int r = dst_width & MASK; \ - int n = dst_width & ~MASK; \ - if (n > 0) { \ - TERP_SIMD(dst_ptr, src_ptr, n, x, dx); \ - } \ - TERP_C(dst_ptr + n * BPP, src_ptr, r, x + n * dx, dx); \ - } - -#ifdef HAS_SCALEFILTERCOLS_NEON -CANY(ScaleFilterCols_Any_NEON, ScaleFilterCols_NEON, ScaleFilterCols_C, 1, 7) -#endif -#ifdef HAS_SCALEFILTERCOLS_MSA -CANY(ScaleFilterCols_Any_MSA, ScaleFilterCols_MSA, ScaleFilterCols_C, 1, 15) -#endif -#ifdef HAS_SCALEARGBCOLS_NEON -CANY(ScaleARGBCols_Any_NEON, ScaleARGBCols_NEON, ScaleARGBCols_C, 4, 7) -#endif -#ifdef HAS_SCALEARGBCOLS_MSA -CANY(ScaleARGBCols_Any_MSA, ScaleARGBCols_MSA, ScaleARGBCols_C, 4, 3) -#endif -#ifdef HAS_SCALEARGBCOLS_MMI -CANY(ScaleARGBCols_Any_MMI, ScaleARGBCols_MMI, ScaleARGBCols_C, 4, 0) -#endif -#ifdef HAS_SCALEARGBFILTERCOLS_NEON -CANY(ScaleARGBFilterCols_Any_NEON, - ScaleARGBFilterCols_NEON, - ScaleARGBFilterCols_C, - 4, - 3) -#endif -#ifdef HAS_SCALEARGBFILTERCOLS_MSA -CANY(ScaleARGBFilterCols_Any_MSA, - ScaleARGBFilterCols_MSA, - ScaleARGBFilterCols_C, - 4, - 7) -#endif -#undef CANY - -// Scale up horizontally 2 times using linear filter. 
-#define SUH2LANY(NAME, SIMD, C, MASK, PTYPE) \ - void NAME(const PTYPE* src_ptr, PTYPE* dst_ptr, int dst_width) { \ - int work_width = (dst_width - 1) & ~1; \ - int r = work_width & MASK; \ - int n = work_width & ~MASK; \ - dst_ptr[0] = src_ptr[0]; \ - if (work_width > 0) { \ - if (n != 0) { \ - SIMD(src_ptr, dst_ptr + 1, n); \ - } \ - C(src_ptr + (n / 2), dst_ptr + n + 1, r); \ - } \ - dst_ptr[dst_width - 1] = src_ptr[(dst_width / 2) - 1]; \ - } - -// Even the C versions need to be wrapped, because boundary pixels have to -// be handled differently - -SUH2LANY(ScaleRowUp2_Linear_Any_C, - ScaleRowUp2_Linear_C, - ScaleRowUp2_Linear_C, - 0, - uint8_t) - -SUH2LANY(ScaleRowUp2_Linear_16_Any_C, - ScaleRowUp2_Linear_16_C, - ScaleRowUp2_Linear_16_C, - 0, - uint16_t) - -#ifdef HAS_SCALEROWUP2LINEAR_SSE2 -SUH2LANY(ScaleRowUp2_Linear_Any_SSE2, - ScaleRowUp2_Linear_SSE2, - ScaleRowUp2_Linear_C, - 15, - uint8_t) -#endif - -#ifdef HAS_SCALEROWUP2LINEAR_SSSE3 -SUH2LANY(ScaleRowUp2_Linear_Any_SSSE3, - ScaleRowUp2_Linear_SSSE3, - ScaleRowUp2_Linear_C, - 15, - uint8_t) -#endif - -#ifdef HAS_SCALEROWUP2LINEAR_12_SSSE3 -SUH2LANY(ScaleRowUp2_Linear_12_Any_SSSE3, - ScaleRowUp2_Linear_12_SSSE3, - ScaleRowUp2_Linear_16_C, - 15, - uint16_t) -#endif - -#ifdef HAS_SCALEROWUP2LINEAR_16_SSE2 -SUH2LANY(ScaleRowUp2_Linear_16_Any_SSE2, - ScaleRowUp2_Linear_16_SSE2, - ScaleRowUp2_Linear_16_C, - 7, - uint16_t) -#endif - -#ifdef HAS_SCALEROWUP2LINEAR_AVX2 -SUH2LANY(ScaleRowUp2_Linear_Any_AVX2, - ScaleRowUp2_Linear_AVX2, - ScaleRowUp2_Linear_C, - 31, - uint8_t) -#endif - -#ifdef HAS_SCALEROWUP2LINEAR_12_AVX2 -SUH2LANY(ScaleRowUp2_Linear_12_Any_AVX2, - ScaleRowUp2_Linear_12_AVX2, - ScaleRowUp2_Linear_16_C, - 31, - uint16_t) -#endif - -#ifdef HAS_SCALEROWUP2LINEAR_16_AVX2 -SUH2LANY(ScaleRowUp2_Linear_16_Any_AVX2, - ScaleRowUp2_Linear_16_AVX2, - ScaleRowUp2_Linear_16_C, - 15, - uint16_t) -#endif - -#ifdef HAS_SCALEROWUP2LINEAR_NEON -SUH2LANY(ScaleRowUp2_Linear_Any_NEON, - ScaleRowUp2_Linear_NEON, - ScaleRowUp2_Linear_C, - 15, - uint8_t) -#endif - -#ifdef HAS_SCALEROWUP2LINEAR_12_NEON -SUH2LANY(ScaleRowUp2_Linear_12_Any_NEON, - ScaleRowUp2_Linear_12_NEON, - ScaleRowUp2_Linear_16_C, - 15, - uint16_t) -#endif - -#ifdef HAS_SCALEROWUP2LINEAR_16_NEON -SUH2LANY(ScaleRowUp2_Linear_16_Any_NEON, - ScaleRowUp2_Linear_16_NEON, - ScaleRowUp2_Linear_16_C, - 15, - uint16_t) -#endif - -#undef SUH2LANY - -// Scale up 2 times using bilinear filter. -// This function produces 2 rows at a time. 
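A quick worked row makes the SUH2LANY edge handling concrete. Assuming the interior kernel blends neighbouring source pixels 3:1 with round-to-nearest (the same convention SU2BLANY below uses for its edge pixels), upscaling the 4-pixel row {10, 20, 30, 40} to width 8 gives:

    dst[0] = 10                          (copied: src[0])
    dst[1] = (3*10 + 20 + 2) >> 2 = 13
    dst[2] = (10 + 3*20 + 2) >> 2 = 18
    dst[3] = (3*20 + 30 + 2) >> 2 = 23
    dst[4] = (20 + 3*30 + 2) >> 2 = 28
    dst[5] = (3*30 + 40 + 2) >> 2 = 33
    dst[6] = (30 + 3*40 + 2) >> 2 = 38
    dst[7] = 40                          (copied: src[dst_width/2 - 1])

Only the six interior pixels go through the SIMD or C kernel; the two edge pixels are written by the wrapper itself, which is why even the plain C kernels need wrapping.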
-#define SU2BLANY(NAME, SIMD, C, MASK, PTYPE) \ - void NAME(const PTYPE* src_ptr, ptrdiff_t src_stride, PTYPE* dst_ptr, \ - ptrdiff_t dst_stride, int dst_width) { \ - int work_width = (dst_width - 1) & ~1; \ - int r = work_width & MASK; \ - int n = work_width & ~MASK; \ - const PTYPE* sa = src_ptr; \ - const PTYPE* sb = src_ptr + src_stride; \ - PTYPE* da = dst_ptr; \ - PTYPE* db = dst_ptr + dst_stride; \ - da[0] = (3 * sa[0] + sb[0] + 2) >> 2; \ - db[0] = (sa[0] + 3 * sb[0] + 2) >> 2; \ - if (work_width > 0) { \ - if (n != 0) { \ - SIMD(sa, sb - sa, da + 1, db - da, n); \ - } \ - C(sa + (n / 2), sb - sa, da + n + 1, db - da, r); \ - } \ - da[dst_width - 1] = \ - (3 * sa[(dst_width - 1) / 2] + sb[(dst_width - 1) / 2] + 2) >> 2; \ - db[dst_width - 1] = \ - (sa[(dst_width - 1) / 2] + 3 * sb[(dst_width - 1) / 2] + 2) >> 2; \ - } - -SU2BLANY(ScaleRowUp2_Bilinear_Any_C, - ScaleRowUp2_Bilinear_C, - ScaleRowUp2_Bilinear_C, - 0, - uint8_t) - -SU2BLANY(ScaleRowUp2_Bilinear_16_Any_C, - ScaleRowUp2_Bilinear_16_C, - ScaleRowUp2_Bilinear_16_C, - 0, - uint16_t) - -#ifdef HAS_SCALEROWUP2BILINEAR_SSE2 -SU2BLANY(ScaleRowUp2_Bilinear_Any_SSE2, - ScaleRowUp2_Bilinear_SSE2, - ScaleRowUp2_Bilinear_C, - 15, - uint8_t) -#endif - -#ifdef HAS_SCALEROWUP2BILINEAR_12_SSSE3 -SU2BLANY(ScaleRowUp2_Bilinear_12_Any_SSSE3, - ScaleRowUp2_Bilinear_12_SSSE3, - ScaleRowUp2_Bilinear_16_C, - 15, - uint16_t) -#endif - -#ifdef HAS_SCALEROWUP2BILINEAR_16_SSE2 -SU2BLANY(ScaleRowUp2_Bilinear_16_Any_SSSE3, - ScaleRowUp2_Bilinear_16_SSE2, - ScaleRowUp2_Bilinear_16_C, - 7, - uint16_t) -#endif - -#ifdef HAS_SCALEROWUP2BILINEAR_SSSE3 -SU2BLANY(ScaleRowUp2_Bilinear_Any_SSSE3, - ScaleRowUp2_Bilinear_SSSE3, - ScaleRowUp2_Bilinear_C, - 15, - uint8_t) -#endif - -#ifdef HAS_SCALEROWUP2BILINEAR_AVX2 -SU2BLANY(ScaleRowUp2_Bilinear_Any_AVX2, - ScaleRowUp2_Bilinear_AVX2, - ScaleRowUp2_Bilinear_C, - 31, - uint8_t) -#endif - -#ifdef HAS_SCALEROWUP2BILINEAR_12_AVX2 -SU2BLANY(ScaleRowUp2_Bilinear_12_Any_AVX2, - ScaleRowUp2_Bilinear_12_AVX2, - ScaleRowUp2_Bilinear_16_C, - 15, - uint16_t) -#endif - -#ifdef HAS_SCALEROWUP2BILINEAR_16_AVX2 -SU2BLANY(ScaleRowUp2_Bilinear_16_Any_AVX2, - ScaleRowUp2_Bilinear_16_AVX2, - ScaleRowUp2_Bilinear_16_C, - 15, - uint16_t) -#endif - -#ifdef HAS_SCALEROWUP2BILINEAR_NEON -SU2BLANY(ScaleRowUp2_Bilinear_Any_NEON, - ScaleRowUp2_Bilinear_NEON, - ScaleRowUp2_Bilinear_C, - 15, - uint8_t) -#endif - -#ifdef HAS_SCALEROWUP2BILINEAR_12_NEON -SU2BLANY(ScaleRowUp2_Bilinear_12_Any_NEON, - ScaleRowUp2_Bilinear_12_NEON, - ScaleRowUp2_Bilinear_16_C, - 15, - uint16_t) -#endif - -#ifdef HAS_SCALEROWUP2BILINEAR_16_NEON -SU2BLANY(ScaleRowUp2_Bilinear_16_Any_NEON, - ScaleRowUp2_Bilinear_16_NEON, - ScaleRowUp2_Bilinear_16_C, - 7, - uint16_t) -#endif - -#undef SU2BLANY - -// Scale bi-planar plane up horizontally 2 times using linear filter. 
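Because the 2x bilinear upsample is separable, the interior output of the SU2BLANY path is the horizontal 3:1 blend applied on top of the vertical 3:1 blend, so each destination pixel is approximately (9*A + 3*B + 3*C + D) / 16, where A is the nearest source sample, B and C are its horizontal and vertical neighbours, and D is the diagonal neighbour. The exact rounding of the fused kernels is an implementation detail of the row files not shown in this hunk; the edge rows and columns, visible in the wrapper above, fall back to the plain one-dimensional 3:1 blends. The bi-planar wrappers that follow apply the same arithmetic per channel.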
-#define SBUH2LANY(NAME, SIMD, C, MASK, PTYPE) \ - void NAME(const PTYPE* src_ptr, PTYPE* dst_ptr, int dst_width) { \ - int work_width = (dst_width - 1) & ~1; \ - int r = work_width & MASK; \ - int n = work_width & ~MASK; \ - dst_ptr[0] = src_ptr[0]; \ - dst_ptr[1] = src_ptr[1]; \ - if (work_width > 0) { \ - if (n != 0) { \ - SIMD(src_ptr, dst_ptr + 2, n); \ - } \ - C(src_ptr + n, dst_ptr + 2 * n + 2, r); \ - } \ - dst_ptr[2 * dst_width - 2] = src_ptr[((dst_width + 1) & ~1) - 2]; \ - dst_ptr[2 * dst_width - 1] = src_ptr[((dst_width + 1) & ~1) - 1]; \ - } - -SBUH2LANY(ScaleUVRowUp2_Linear_Any_C, - ScaleUVRowUp2_Linear_C, - ScaleUVRowUp2_Linear_C, - 0, - uint8_t) - -SBUH2LANY(ScaleUVRowUp2_Linear_16_Any_C, - ScaleUVRowUp2_Linear_16_C, - ScaleUVRowUp2_Linear_16_C, - 0, - uint16_t) - -#ifdef HAS_SCALEUVROWUP2LINEAR_SSSE3 -SBUH2LANY(ScaleUVRowUp2_Linear_Any_SSSE3, - ScaleUVRowUp2_Linear_SSSE3, - ScaleUVRowUp2_Linear_C, - 7, - uint8_t) -#endif - -#ifdef HAS_SCALEUVROWUP2LINEAR_AVX2 -SBUH2LANY(ScaleUVRowUp2_Linear_Any_AVX2, - ScaleUVRowUp2_Linear_AVX2, - ScaleUVRowUp2_Linear_C, - 15, - uint8_t) -#endif - -#ifdef HAS_SCALEUVROWUP2LINEAR_16_SSE2 -SBUH2LANY(ScaleUVRowUp2_Linear_16_Any_SSE2, - ScaleUVRowUp2_Linear_16_SSE2, - ScaleUVRowUp2_Linear_16_C, - 3, - uint16_t) -#endif - -#ifdef HAS_SCALEUVROWUP2LINEAR_16_AVX2 -SBUH2LANY(ScaleUVRowUp2_Linear_16_Any_AVX2, - ScaleUVRowUp2_Linear_16_AVX2, - ScaleUVRowUp2_Linear_16_C, - 7, - uint16_t) -#endif - -#ifdef HAS_SCALEUVROWUP2LINEAR_NEON -SBUH2LANY(ScaleUVRowUp2_Linear_Any_NEON, - ScaleUVRowUp2_Linear_NEON, - ScaleUVRowUp2_Linear_C, - 15, - uint8_t) -#endif - -#ifdef HAS_SCALEUVROWUP2LINEAR_16_NEON -SBUH2LANY(ScaleUVRowUp2_Linear_16_Any_NEON, - ScaleUVRowUp2_Linear_16_NEON, - ScaleUVRowUp2_Linear_16_C, - 15, - uint16_t) -#endif - -#undef SBUH2LANY - -// Scale bi-planar plane up 2 times using bilinear filter. -// This function produces 2 rows at a time. 
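The ((dst_width + 1) & ~1) - 2 indexing above is the one non-obvious part of the UV wrapper: dst_width counts UV pixels, each pixel occupies two interleaved elements, and an odd width is rounded up to even first so the last source U/V pair is addressed the same way in both cases. Widths below are invented.

/* Stand-alone illustration (not libyuv code) of the last-pair index arithmetic. */
#include <stdio.h>

int main(void) {
  int widths[] = {9, 10};
  for (int i = 0; i < 2; ++i) {
    int dst_width = widths[i];
    int last_u = ((dst_width + 1) & ~1) - 2;   /* element index of the last source U */
    int last_v = ((dst_width + 1) & ~1) - 1;   /* element index of the last source V */
    printf("dst_width=%d -> last source pair at src[%d], src[%d]\n",
           dst_width, last_u, last_v);
  }
  return 0;
}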
-#define SBU2BLANY(NAME, SIMD, C, MASK, PTYPE) \ - void NAME(const PTYPE* src_ptr, ptrdiff_t src_stride, PTYPE* dst_ptr, \ - ptrdiff_t dst_stride, int dst_width) { \ - int work_width = (dst_width - 1) & ~1; \ - int r = work_width & MASK; \ - int n = work_width & ~MASK; \ - const PTYPE* sa = src_ptr; \ - const PTYPE* sb = src_ptr + src_stride; \ - PTYPE* da = dst_ptr; \ - PTYPE* db = dst_ptr + dst_stride; \ - da[0] = (3 * sa[0] + sb[0] + 2) >> 2; \ - db[0] = (sa[0] + 3 * sb[0] + 2) >> 2; \ - da[1] = (3 * sa[1] + sb[1] + 2) >> 2; \ - db[1] = (sa[1] + 3 * sb[1] + 2) >> 2; \ - if (work_width > 0) { \ - if (n != 0) { \ - SIMD(sa, sb - sa, da + 2, db - da, n); \ - } \ - C(sa + n, sb - sa, da + 2 * n + 2, db - da, r); \ - } \ - da[2 * dst_width - 2] = (3 * sa[((dst_width + 1) & ~1) - 2] + \ - sb[((dst_width + 1) & ~1) - 2] + 2) >> \ - 2; \ - db[2 * dst_width - 2] = (sa[((dst_width + 1) & ~1) - 2] + \ - 3 * sb[((dst_width + 1) & ~1) - 2] + 2) >> \ - 2; \ - da[2 * dst_width - 1] = (3 * sa[((dst_width + 1) & ~1) - 1] + \ - sb[((dst_width + 1) & ~1) - 1] + 2) >> \ - 2; \ - db[2 * dst_width - 1] = (sa[((dst_width + 1) & ~1) - 1] + \ - 3 * sb[((dst_width + 1) & ~1) - 1] + 2) >> \ - 2; \ - } - -SBU2BLANY(ScaleUVRowUp2_Bilinear_Any_C, - ScaleUVRowUp2_Bilinear_C, - ScaleUVRowUp2_Bilinear_C, - 0, - uint8_t) - -SBU2BLANY(ScaleUVRowUp2_Bilinear_16_Any_C, - ScaleUVRowUp2_Bilinear_16_C, - ScaleUVRowUp2_Bilinear_16_C, - 0, - uint16_t) - -#ifdef HAS_SCALEUVROWUP2BILINEAR_SSSE3 -SBU2BLANY(ScaleUVRowUp2_Bilinear_Any_SSSE3, - ScaleUVRowUp2_Bilinear_SSSE3, - ScaleUVRowUp2_Bilinear_C, - 7, - uint8_t) -#endif - -#ifdef HAS_SCALEUVROWUP2BILINEAR_AVX2 -SBU2BLANY(ScaleUVRowUp2_Bilinear_Any_AVX2, - ScaleUVRowUp2_Bilinear_AVX2, - ScaleUVRowUp2_Bilinear_C, - 15, - uint8_t) -#endif - -#ifdef HAS_SCALEUVROWUP2BILINEAR_16_SSE2 -SBU2BLANY(ScaleUVRowUp2_Bilinear_16_Any_SSE2, - ScaleUVRowUp2_Bilinear_16_SSE2, - ScaleUVRowUp2_Bilinear_16_C, - 7, - uint16_t) -#endif - -#ifdef HAS_SCALEUVROWUP2BILINEAR_16_AVX2 -SBU2BLANY(ScaleUVRowUp2_Bilinear_16_Any_AVX2, - ScaleUVRowUp2_Bilinear_16_AVX2, - ScaleUVRowUp2_Bilinear_16_C, - 7, - uint16_t) -#endif - -#ifdef HAS_SCALEUVROWUP2BILINEAR_NEON -SBU2BLANY(ScaleUVRowUp2_Bilinear_Any_NEON, - ScaleUVRowUp2_Bilinear_NEON, - ScaleUVRowUp2_Bilinear_C, - 7, - uint8_t) -#endif - -#ifdef HAS_SCALEUVROWUP2BILINEAR_16_NEON -SBU2BLANY(ScaleUVRowUp2_Bilinear_16_Any_NEON, - ScaleUVRowUp2_Bilinear_16_NEON, - ScaleUVRowUp2_Bilinear_16_C, - 7, - uint16_t) -#endif - -#undef SBU2BLANY - -#ifdef __cplusplus -} // extern "C" -} // namespace libyuv -#endif diff --git a/thirdparty/libyuv/source/scale_argb.cc b/thirdparty/libyuv/source/scale_argb.cc deleted file mode 100644 index 451d4ec..0000000 --- a/thirdparty/libyuv/source/scale_argb.cc +++ /dev/null @@ -1,1091 +0,0 @@ -/* - * Copyright 2011 The LibYuv Project Authors. All rights reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#include "libyuv/scale.h" - -#include -#include - -#include "libyuv/cpu_id.h" -#include "libyuv/planar_functions.h" // For CopyARGB -#include "libyuv/row.h" -#include "libyuv/scale_row.h" - -#ifdef __cplusplus -namespace libyuv { -extern "C" { -#endif - -static __inline int Abs(int v) { - return v >= 0 ? 
v : -v; -} - -// ScaleARGB ARGB, 1/2 -// This is an optimized version for scaling down a ARGB to 1/2 of -// its original size. -static void ScaleARGBDown2(int src_width, - int src_height, - int dst_width, - int dst_height, - int src_stride, - int dst_stride, - const uint8_t* src_argb, - uint8_t* dst_argb, - int x, - int dx, - int y, - int dy, - enum FilterMode filtering) { - int j; - int row_stride = src_stride * (dy >> 16); - void (*ScaleARGBRowDown2)(const uint8_t* src_argb, ptrdiff_t src_stride, - uint8_t* dst_argb, int dst_width) = - filtering == kFilterNone - ? ScaleARGBRowDown2_C - : (filtering == kFilterLinear ? ScaleARGBRowDown2Linear_C - : ScaleARGBRowDown2Box_C); - (void)src_width; - (void)src_height; - (void)dx; - assert(dx == 65536 * 2); // Test scale factor of 2. - assert((dy & 0x1ffff) == 0); // Test vertical scale is multiple of 2. - // Advance to odd row, even column. - if (filtering == kFilterBilinear) { - src_argb += (y >> 16) * src_stride + (x >> 16) * 4; - } else { - src_argb += (y >> 16) * src_stride + ((x >> 16) - 1) * 4; - } - -#if defined(HAS_SCALEARGBROWDOWN2_SSE2) - if (TestCpuFlag(kCpuHasSSE2)) { - ScaleARGBRowDown2 = - filtering == kFilterNone - ? ScaleARGBRowDown2_Any_SSE2 - : (filtering == kFilterLinear ? ScaleARGBRowDown2Linear_Any_SSE2 - : ScaleARGBRowDown2Box_Any_SSE2); - if (IS_ALIGNED(dst_width, 4)) { - ScaleARGBRowDown2 = - filtering == kFilterNone - ? ScaleARGBRowDown2_SSE2 - : (filtering == kFilterLinear ? ScaleARGBRowDown2Linear_SSE2 - : ScaleARGBRowDown2Box_SSE2); - } - } -#endif -#if defined(HAS_SCALEARGBROWDOWN2_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - ScaleARGBRowDown2 = - filtering == kFilterNone - ? ScaleARGBRowDown2_Any_NEON - : (filtering == kFilterLinear ? ScaleARGBRowDown2Linear_Any_NEON - : ScaleARGBRowDown2Box_Any_NEON); - if (IS_ALIGNED(dst_width, 8)) { - ScaleARGBRowDown2 = - filtering == kFilterNone - ? ScaleARGBRowDown2_NEON - : (filtering == kFilterLinear ? ScaleARGBRowDown2Linear_NEON - : ScaleARGBRowDown2Box_NEON); - } - } -#endif -#if defined(HAS_SCALEARGBROWDOWN2_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - ScaleARGBRowDown2 = - filtering == kFilterNone - ? ScaleARGBRowDown2_Any_MMI - : (filtering == kFilterLinear ? ScaleARGBRowDown2Linear_Any_MMI - : ScaleARGBRowDown2Box_Any_MMI); - if (IS_ALIGNED(dst_width, 2)) { - ScaleARGBRowDown2 = - filtering == kFilterNone - ? ScaleARGBRowDown2_MMI - : (filtering == kFilterLinear ? ScaleARGBRowDown2Linear_MMI - : ScaleARGBRowDown2Box_MMI); - } - } -#endif -#if defined(HAS_SCALEARGBROWDOWN2_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - ScaleARGBRowDown2 = - filtering == kFilterNone - ? ScaleARGBRowDown2_Any_MSA - : (filtering == kFilterLinear ? ScaleARGBRowDown2Linear_Any_MSA - : ScaleARGBRowDown2Box_Any_MSA); - if (IS_ALIGNED(dst_width, 4)) { - ScaleARGBRowDown2 = - filtering == kFilterNone - ? ScaleARGBRowDown2_MSA - : (filtering == kFilterLinear ? ScaleARGBRowDown2Linear_MSA - : ScaleARGBRowDown2Box_MSA); - } - } -#endif - - if (filtering == kFilterLinear) { - src_stride = 0; - } - for (j = 0; j < dst_height; ++j) { - ScaleARGBRowDown2(src_argb, src_stride, dst_argb, dst_width); - src_argb += row_stride; - dst_argb += dst_stride; - } -} - -// ScaleARGB ARGB, 1/4 -// This is an optimized version for scaling down a ARGB to 1/4 of -// its original size. 
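The fast path above leans on the 16.16 fixed-point convention used throughout this file: a clean 1/2 downscale means dx and dy are exactly 2.0 (0x20000), which is what the asserts check, and row_stride then skips two source rows per output row. The stride below is invented.

/* Stand-alone illustration (not libyuv code) of the ScaleARGBDown2 step checks. */
#include <assert.h>
#include <stdio.h>

int main(void) {
  int dx = 65536 * 2;                     /* horizontal step: 2.0 in 16.16 fixed point */
  int dy = 65536 * 2;                     /* vertical step: 2.0 in 16.16 fixed point   */
  int src_stride = 1024;                  /* made-up ARGB stride in bytes              */
  assert(dx == 65536 * 2);                /* scale factor of exactly 2                 */
  assert((dy & 0x1ffff) == 0);            /* vertical scale is a multiple of 2         */
  printf("dx=0x%x dy=0x%x: skip %d source rows (%d bytes) per output row\n",
         (unsigned)dx, (unsigned)dy, dy >> 16, src_stride * (dy >> 16));
  return 0;
}

The 1/4 box path below builds its 4x4 average out of the same 2x2 box kernel: two pairs of source rows are boxed into a half-width row pair, which is then boxed once more into the destination row.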
-static void ScaleARGBDown4Box(int src_width, - int src_height, - int dst_width, - int dst_height, - int src_stride, - int dst_stride, - const uint8_t* src_argb, - uint8_t* dst_argb, - int x, - int dx, - int y, - int dy) { - int j; - // Allocate 2 rows of ARGB. - const int kRowSize = (dst_width * 2 * 4 + 31) & ~31; - align_buffer_64(row, kRowSize * 2); - int row_stride = src_stride * (dy >> 16); - void (*ScaleARGBRowDown2)(const uint8_t* src_argb, ptrdiff_t src_stride, - uint8_t* dst_argb, int dst_width) = - ScaleARGBRowDown2Box_C; - // Advance to odd row, even column. - src_argb += (y >> 16) * src_stride + (x >> 16) * 4; - (void)src_width; - (void)src_height; - (void)dx; - assert(dx == 65536 * 4); // Test scale factor of 4. - assert((dy & 0x3ffff) == 0); // Test vertical scale is multiple of 4. -#if defined(HAS_SCALEARGBROWDOWN2_SSE2) - if (TestCpuFlag(kCpuHasSSE2)) { - ScaleARGBRowDown2 = ScaleARGBRowDown2Box_Any_SSE2; - if (IS_ALIGNED(dst_width, 4)) { - ScaleARGBRowDown2 = ScaleARGBRowDown2Box_SSE2; - } - } -#endif -#if defined(HAS_SCALEARGBROWDOWN2_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - ScaleARGBRowDown2 = ScaleARGBRowDown2Box_Any_NEON; - if (IS_ALIGNED(dst_width, 8)) { - ScaleARGBRowDown2 = ScaleARGBRowDown2Box_NEON; - } - } -#endif - - for (j = 0; j < dst_height; ++j) { - ScaleARGBRowDown2(src_argb, src_stride, row, dst_width * 2); - ScaleARGBRowDown2(src_argb + src_stride * 2, src_stride, row + kRowSize, - dst_width * 2); - ScaleARGBRowDown2(row, kRowSize, dst_argb, dst_width); - src_argb += row_stride; - dst_argb += dst_stride; - } - free_aligned_buffer_64(row); -} - -// ScaleARGB ARGB Even -// This is an optimized version for scaling down a ARGB to even -// multiple of its original size. -static void ScaleARGBDownEven(int src_width, - int src_height, - int dst_width, - int dst_height, - int src_stride, - int dst_stride, - const uint8_t* src_argb, - uint8_t* dst_argb, - int x, - int dx, - int y, - int dy, - enum FilterMode filtering) { - int j; - int col_step = dx >> 16; - int row_stride = (dy >> 16) * src_stride; - void (*ScaleARGBRowDownEven)(const uint8_t* src_argb, ptrdiff_t src_stride, - int src_step, uint8_t* dst_argb, int dst_width) = - filtering ? ScaleARGBRowDownEvenBox_C : ScaleARGBRowDownEven_C; - (void)src_width; - (void)src_height; - assert(IS_ALIGNED(src_width, 2)); - assert(IS_ALIGNED(src_height, 2)); - src_argb += (y >> 16) * src_stride + (x >> 16) * 4; -#if defined(HAS_SCALEARGBROWDOWNEVEN_SSE2) - if (TestCpuFlag(kCpuHasSSE2)) { - ScaleARGBRowDownEven = filtering ? ScaleARGBRowDownEvenBox_Any_SSE2 - : ScaleARGBRowDownEven_Any_SSE2; - if (IS_ALIGNED(dst_width, 4)) { - ScaleARGBRowDownEven = - filtering ? ScaleARGBRowDownEvenBox_SSE2 : ScaleARGBRowDownEven_SSE2; - } - } -#endif -#if defined(HAS_SCALEARGBROWDOWNEVEN_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - ScaleARGBRowDownEven = filtering ? ScaleARGBRowDownEvenBox_Any_NEON - : ScaleARGBRowDownEven_Any_NEON; - if (IS_ALIGNED(dst_width, 4)) { - ScaleARGBRowDownEven = - filtering ? ScaleARGBRowDownEvenBox_NEON : ScaleARGBRowDownEven_NEON; - } - } -#endif -#if defined(HAS_SCALEARGBROWDOWNEVEN_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - ScaleARGBRowDownEven = filtering ? ScaleARGBRowDownEvenBox_Any_MMI - : ScaleARGBRowDownEven_Any_MMI; - if (IS_ALIGNED(dst_width, 2)) { - ScaleARGBRowDownEven = - filtering ? ScaleARGBRowDownEvenBox_MMI : ScaleARGBRowDownEven_MMI; - } - } -#endif -#if defined(HAS_SCALEARGBROWDOWNEVEN_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - ScaleARGBRowDownEven = filtering ? 
ScaleARGBRowDownEvenBox_Any_MSA - : ScaleARGBRowDownEven_Any_MSA; - if (IS_ALIGNED(dst_width, 4)) { - ScaleARGBRowDownEven = - filtering ? ScaleARGBRowDownEvenBox_MSA : ScaleARGBRowDownEven_MSA; - } - } -#endif - - if (filtering == kFilterLinear) { - src_stride = 0; - } - for (j = 0; j < dst_height; ++j) { - ScaleARGBRowDownEven(src_argb, src_stride, col_step, dst_argb, dst_width); - src_argb += row_stride; - dst_argb += dst_stride; - } -} - -// Scale ARGB down with bilinear interpolation. -static void ScaleARGBBilinearDown(int src_width, - int src_height, - int dst_width, - int dst_height, - int src_stride, - int dst_stride, - const uint8_t* src_argb, - uint8_t* dst_argb, - int x, - int dx, - int y, - int dy, - enum FilterMode filtering) { - int j; - void (*InterpolateRow)(uint8_t * dst_argb, const uint8_t* src_argb, - ptrdiff_t src_stride, int dst_width, - int source_y_fraction) = InterpolateRow_C; - void (*ScaleARGBFilterCols)(uint8_t * dst_argb, const uint8_t* src_argb, - int dst_width, int x, int dx) = - (src_width >= 32768) ? ScaleARGBFilterCols64_C : ScaleARGBFilterCols_C; - int64_t xlast = x + (int64_t)(dst_width - 1) * dx; - int64_t xl = (dx >= 0) ? x : xlast; - int64_t xr = (dx >= 0) ? xlast : x; - int clip_src_width; - xl = (xl >> 16) & ~3; // Left edge aligned. - xr = (xr >> 16) + 1; // Right most pixel used. Bilinear uses 2 pixels. - xr = (xr + 1 + 3) & ~3; // 1 beyond 4 pixel aligned right most pixel. - if (xr > src_width) { - xr = src_width; - } - clip_src_width = (int)(xr - xl) * 4; // Width aligned to 4. - src_argb += xl * 4; - x -= (int)(xl << 16); -#if defined(HAS_INTERPOLATEROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - InterpolateRow = InterpolateRow_Any_SSSE3; - if (IS_ALIGNED(clip_src_width, 16)) { - InterpolateRow = InterpolateRow_SSSE3; - } - } -#endif -#if defined(HAS_INTERPOLATEROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - InterpolateRow = InterpolateRow_Any_AVX2; - if (IS_ALIGNED(clip_src_width, 32)) { - InterpolateRow = InterpolateRow_AVX2; - } - } -#endif -#if defined(HAS_INTERPOLATEROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - InterpolateRow = InterpolateRow_Any_NEON; - if (IS_ALIGNED(clip_src_width, 16)) { - InterpolateRow = InterpolateRow_NEON; - } - } -#endif -#if defined(HAS_INTERPOLATEROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - InterpolateRow = InterpolateRow_Any_MSA; - if (IS_ALIGNED(clip_src_width, 32)) { - InterpolateRow = InterpolateRow_MSA; - } - } -#endif -#if defined(HAS_SCALEARGBFILTERCOLS_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) { - ScaleARGBFilterCols = ScaleARGBFilterCols_SSSE3; - } -#endif -#if defined(HAS_SCALEARGBFILTERCOLS_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - ScaleARGBFilterCols = ScaleARGBFilterCols_Any_NEON; - if (IS_ALIGNED(dst_width, 4)) { - ScaleARGBFilterCols = ScaleARGBFilterCols_NEON; - } - } -#endif -#if defined(HAS_SCALEARGBFILTERCOLS_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - ScaleARGBFilterCols = ScaleARGBFilterCols_Any_MSA; - if (IS_ALIGNED(dst_width, 8)) { - ScaleARGBFilterCols = ScaleARGBFilterCols_MSA; - } - } -#endif - // TODO(fbarchard): Consider not allocating row buffer for kFilterLinear. - // Allocate a row of ARGB. 
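Before the row buffer is allocated below, the clip arithmetic a few lines up is worth making concrete: only the slice of the source row actually touched by the destination span gets interpolated, padded out to 4-pixel boundaries. The x, dx and dst_width values here are invented (the real ones come from ScaleSlope and from any clip rectangle).

/* Stand-alone illustration (not libyuv code) of the ScaleARGBBilinearDown clip. */
#include <stdint.h>
#include <stdio.h>

int main(void) {
  int src_width = 640;
  int dst_width = 50;                      /* e.g. only half of a 100-wide output    */
  int dx = (640 << 16) / 100;              /* step of the full 640 -> 100 downscale  */
  int x = 50 * dx;                         /* clip region starts at output column 50 */
  int64_t xlast = x + (int64_t)(dst_width - 1) * dx;
  int64_t xl = (dx >= 0) ? x : xlast;      /* leftmost source position touched       */
  int64_t xr = (dx >= 0) ? xlast : x;      /* rightmost source position touched      */
  xl = (xl >> 16) & ~3;                    /* align the left edge down to 4 pixels   */
  xr = (xr >> 16) + 1;                     /* bilinear also reads the next pixel     */
  xr = (xr + 1 + 3) & ~3;                  /* round up past a 4-pixel boundary       */
  if (xr > src_width) xr = src_width;
  printf("read source pixels [%lld, %lld): %lld pixels, %lld bytes of ARGB\n",
         (long long)xl, (long long)xr, (long long)(xr - xl),
         (long long)(xr - xl) * 4);
  return 0;
}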
- { - align_buffer_64(row, clip_src_width * 4); - - const int max_y = (src_height - 1) << 16; - if (y > max_y) { - y = max_y; - } - for (j = 0; j < dst_height; ++j) { - int yi = y >> 16; - const uint8_t* src = src_argb + yi * src_stride; - if (filtering == kFilterLinear) { - ScaleARGBFilterCols(dst_argb, src, dst_width, x, dx); - } else { - int yf = (y >> 8) & 255; - InterpolateRow(row, src, src_stride, clip_src_width, yf); - ScaleARGBFilterCols(dst_argb, row, dst_width, x, dx); - } - dst_argb += dst_stride; - y += dy; - if (y > max_y) { - y = max_y; - } - } - free_aligned_buffer_64(row); - } -} - -// Scale ARGB up with bilinear interpolation. -static void ScaleARGBBilinearUp(int src_width, - int src_height, - int dst_width, - int dst_height, - int src_stride, - int dst_stride, - const uint8_t* src_argb, - uint8_t* dst_argb, - int x, - int dx, - int y, - int dy, - enum FilterMode filtering) { - int j; - void (*InterpolateRow)(uint8_t * dst_argb, const uint8_t* src_argb, - ptrdiff_t src_stride, int dst_width, - int source_y_fraction) = InterpolateRow_C; - void (*ScaleARGBFilterCols)(uint8_t * dst_argb, const uint8_t* src_argb, - int dst_width, int x, int dx) = - filtering ? ScaleARGBFilterCols_C : ScaleARGBCols_C; - const int max_y = (src_height - 1) << 16; -#if defined(HAS_INTERPOLATEROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - InterpolateRow = InterpolateRow_Any_SSSE3; - if (IS_ALIGNED(dst_width, 4)) { - InterpolateRow = InterpolateRow_SSSE3; - } - } -#endif -#if defined(HAS_INTERPOLATEROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - InterpolateRow = InterpolateRow_Any_AVX2; - if (IS_ALIGNED(dst_width, 8)) { - InterpolateRow = InterpolateRow_AVX2; - } - } -#endif -#if defined(HAS_INTERPOLATEROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - InterpolateRow = InterpolateRow_Any_NEON; - if (IS_ALIGNED(dst_width, 4)) { - InterpolateRow = InterpolateRow_NEON; - } - } -#endif -#if defined(HAS_INTERPOLATEROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - InterpolateRow = InterpolateRow_Any_MMI; - if (IS_ALIGNED(dst_width, 2)) { - InterpolateRow = InterpolateRow_MMI; - } - } -#endif -#if defined(HAS_INTERPOLATEROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - InterpolateRow = InterpolateRow_Any_MSA; - if (IS_ALIGNED(dst_width, 8)) { - InterpolateRow = InterpolateRow_MSA; - } - } -#endif - if (src_width >= 32768) { - ScaleARGBFilterCols = - filtering ? 
ScaleARGBFilterCols64_C : ScaleARGBCols64_C; - } -#if defined(HAS_SCALEARGBFILTERCOLS_SSSE3) - if (filtering && TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) { - ScaleARGBFilterCols = ScaleARGBFilterCols_SSSE3; - } -#endif -#if defined(HAS_SCALEARGBFILTERCOLS_NEON) - if (filtering && TestCpuFlag(kCpuHasNEON)) { - ScaleARGBFilterCols = ScaleARGBFilterCols_Any_NEON; - if (IS_ALIGNED(dst_width, 4)) { - ScaleARGBFilterCols = ScaleARGBFilterCols_NEON; - } - } -#endif -#if defined(HAS_SCALEARGBFILTERCOLS_MSA) - if (filtering && TestCpuFlag(kCpuHasMSA)) { - ScaleARGBFilterCols = ScaleARGBFilterCols_Any_MSA; - if (IS_ALIGNED(dst_width, 8)) { - ScaleARGBFilterCols = ScaleARGBFilterCols_MSA; - } - } -#endif -#if defined(HAS_SCALEARGBCOLS_SSE2) - if (!filtering && TestCpuFlag(kCpuHasSSE2) && src_width < 32768) { - ScaleARGBFilterCols = ScaleARGBCols_SSE2; - } -#endif -#if defined(HAS_SCALEARGBCOLS_NEON) - if (!filtering && TestCpuFlag(kCpuHasNEON)) { - ScaleARGBFilterCols = ScaleARGBCols_Any_NEON; - if (IS_ALIGNED(dst_width, 8)) { - ScaleARGBFilterCols = ScaleARGBCols_NEON; - } - } -#endif -#if defined(HAS_SCALEARGBCOLS_MMI) - if (!filtering && TestCpuFlag(kCpuHasMMI)) { - ScaleARGBFilterCols = ScaleARGBCols_Any_MMI; - if (IS_ALIGNED(dst_width, 1)) { - ScaleARGBFilterCols = ScaleARGBCols_MMI; - } - } -#endif -#if defined(HAS_SCALEARGBCOLS_MSA) - if (!filtering && TestCpuFlag(kCpuHasMSA)) { - ScaleARGBFilterCols = ScaleARGBCols_Any_MSA; - if (IS_ALIGNED(dst_width, 4)) { - ScaleARGBFilterCols = ScaleARGBCols_MSA; - } - } -#endif - if (!filtering && src_width * 2 == dst_width && x < 0x8000) { - ScaleARGBFilterCols = ScaleARGBColsUp2_C; -#if defined(HAS_SCALEARGBCOLSUP2_SSE2) - if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8)) { - ScaleARGBFilterCols = ScaleARGBColsUp2_SSE2; - } -#endif -#if defined(HAS_SCALEARGBCOLSUP2_MMI) - if (TestCpuFlag(kCpuHasMMI) && IS_ALIGNED(dst_width, 4)) { - ScaleARGBFilterCols = ScaleARGBColsUp2_MMI; - } -#endif - } - - if (y > max_y) { - y = max_y; - } - - { - int yi = y >> 16; - const uint8_t* src = src_argb + yi * src_stride; - - // Allocate 2 rows of ARGB. - const int kRowSize = (dst_width * 4 + 31) & ~31; - align_buffer_64(row, kRowSize * 2); - - uint8_t* rowptr = row; - int rowstride = kRowSize; - int lasty = yi; - - ScaleARGBFilterCols(rowptr, src, dst_width, x, dx); - if (src_height > 1) { - src += src_stride; - } - ScaleARGBFilterCols(rowptr + rowstride, src, dst_width, x, dx); - src += src_stride; - - for (j = 0; j < dst_height; ++j) { - yi = y >> 16; - if (yi != lasty) { - if (y > max_y) { - y = max_y; - yi = y >> 16; - src = src_argb + yi * src_stride; - } - if (yi != lasty) { - ScaleARGBFilterCols(rowptr, src, dst_width, x, dx); - rowptr += rowstride; - rowstride = -rowstride; - lasty = yi; - src += src_stride; - } - } - if (filtering == kFilterLinear) { - InterpolateRow(dst_argb, rowptr, 0, dst_width * 4, 0); - } else { - int yf = (y >> 8) & 255; - InterpolateRow(dst_argb, rowptr, rowstride, dst_width * 4, yf); - } - dst_argb += dst_stride; - y += dy; - } - free_aligned_buffer_64(row); - } -} - -#ifdef YUVSCALEUP -// Scale YUV to ARGB up with bilinear interpolation. 
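A note on the row handling in ScaleARGBBilinearUp above (the YUV variant below reuses it): two horizontally scaled rows live in one allocation, and negating the stride after each refill makes rowptr ping-pong between the two halves without any copying. Buffer sizes here are invented.

/* Stand-alone illustration (not libyuv code) of the two-row ping-pong. */
#include <stdio.h>

int main(void) {
  char buf[2][16];                        /* stands in for the two aligned row slots */
  char* rowptr = buf[0];
  int rowstride = (int)sizeof(buf[0]);
  for (int row = 0; row < 4; ++row) {
    printf("source row %d lands in slot %d\n", row, rowptr == buf[0] ? 0 : 1);
    rowptr += rowstride;                  /* move to the other slot ...              */
    rowstride = -rowstride;               /* ... and flip direction for next time    */
  }
  return 0;
}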
-static void ScaleYUVToARGBBilinearUp(int src_width, - int src_height, - int dst_width, - int dst_height, - int src_stride_y, - int src_stride_u, - int src_stride_v, - int dst_stride_argb, - const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* dst_argb, - int x, - int dx, - int y, - int dy, - enum FilterMode filtering) { - int j; - void (*I422ToARGBRow)(const uint8_t* y_buf, const uint8_t* u_buf, - const uint8_t* v_buf, uint8_t* rgb_buf, int width) = - I422ToARGBRow_C; -#if defined(HAS_I422TOARGBROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - I422ToARGBRow = I422ToARGBRow_Any_SSSE3; - if (IS_ALIGNED(src_width, 8)) { - I422ToARGBRow = I422ToARGBRow_SSSE3; - } - } -#endif -#if defined(HAS_I422TOARGBROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - I422ToARGBRow = I422ToARGBRow_Any_AVX2; - if (IS_ALIGNED(src_width, 16)) { - I422ToARGBRow = I422ToARGBRow_AVX2; - } - } -#endif -#if defined(HAS_I422TOARGBROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - I422ToARGBRow = I422ToARGBRow_Any_NEON; - if (IS_ALIGNED(src_width, 8)) { - I422ToARGBRow = I422ToARGBRow_NEON; - } - } -#endif -#if defined(HAS_I422TOARGBROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - I422ToARGBRow = I422ToARGBRow_Any_MMI; - if (IS_ALIGNED(src_width, 4)) { - I422ToARGBRow = I422ToARGBRow_MMI; - } - } -#endif -#if defined(HAS_I422TOARGBROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - I422ToARGBRow = I422ToARGBRow_Any_MSA; - if (IS_ALIGNED(src_width, 8)) { - I422ToARGBRow = I422ToARGBRow_MSA; - } - } -#endif - - void (*InterpolateRow)(uint8_t * dst_argb, const uint8_t* src_argb, - ptrdiff_t src_stride, int dst_width, - int source_y_fraction) = InterpolateRow_C; -#if defined(HAS_INTERPOLATEROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - InterpolateRow = InterpolateRow_Any_SSSE3; - if (IS_ALIGNED(dst_width, 4)) { - InterpolateRow = InterpolateRow_SSSE3; - } - } -#endif -#if defined(HAS_INTERPOLATEROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - InterpolateRow = InterpolateRow_Any_AVX2; - if (IS_ALIGNED(dst_width, 8)) { - InterpolateRow = InterpolateRow_AVX2; - } - } -#endif -#if defined(HAS_INTERPOLATEROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - InterpolateRow = InterpolateRow_Any_NEON; - if (IS_ALIGNED(dst_width, 4)) { - InterpolateRow = InterpolateRow_NEON; - } - } -#endif -#if defined(HAS_INTERPOLATEROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - InterpolateRow = InterpolateRow_Any_MSA; - if (IS_ALIGNED(dst_width, 8)) { - InterpolateRow = InterpolateRow_MSA; - } - } -#endif - - void (*ScaleARGBFilterCols)(uint8_t * dst_argb, const uint8_t* src_argb, - int dst_width, int x, int dx) = - filtering ? ScaleARGBFilterCols_C : ScaleARGBCols_C; - if (src_width >= 32768) { - ScaleARGBFilterCols = - filtering ? 
ScaleARGBFilterCols64_C : ScaleARGBCols64_C; - } -#if defined(HAS_SCALEARGBFILTERCOLS_SSSE3) - if (filtering && TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) { - ScaleARGBFilterCols = ScaleARGBFilterCols_SSSE3; - } -#endif -#if defined(HAS_SCALEARGBFILTERCOLS_NEON) - if (filtering && TestCpuFlag(kCpuHasNEON)) { - ScaleARGBFilterCols = ScaleARGBFilterCols_Any_NEON; - if (IS_ALIGNED(dst_width, 4)) { - ScaleARGBFilterCols = ScaleARGBFilterCols_NEON; - } - } -#endif -#if defined(HAS_SCALEARGBFILTERCOLS_MSA) - if (filtering && TestCpuFlag(kCpuHasMSA)) { - ScaleARGBFilterCols = ScaleARGBFilterCols_Any_MSA; - if (IS_ALIGNED(dst_width, 8)) { - ScaleARGBFilterCols = ScaleARGBFilterCols_MSA; - } - } -#endif -#if defined(HAS_SCALEARGBCOLS_SSE2) - if (!filtering && TestCpuFlag(kCpuHasSSE2) && src_width < 32768) { - ScaleARGBFilterCols = ScaleARGBCols_SSE2; - } -#endif -#if defined(HAS_SCALEARGBCOLS_NEON) - if (!filtering && TestCpuFlag(kCpuHasNEON)) { - ScaleARGBFilterCols = ScaleARGBCols_Any_NEON; - if (IS_ALIGNED(dst_width, 8)) { - ScaleARGBFilterCols = ScaleARGBCols_NEON; - } - } -#endif -#if defined(HAS_SCALEARGBCOLS_MMI) - if (!filtering && TestCpuFlag(kCpuHasMMI)) { - ScaleARGBFilterCols = ScaleARGBCols_Any_MMI; - if (IS_ALIGNED(dst_width, 1)) { - ScaleARGBFilterCols = ScaleARGBCols_MMI; - } - } -#endif -#if defined(HAS_SCALEARGBCOLS_MSA) - if (!filtering && TestCpuFlag(kCpuHasMSA)) { - ScaleARGBFilterCols = ScaleARGBCols_Any_MSA; - if (IS_ALIGNED(dst_width, 4)) { - ScaleARGBFilterCols = ScaleARGBCols_MSA; - } - } -#endif - if (!filtering && src_width * 2 == dst_width && x < 0x8000) { - ScaleARGBFilterCols = ScaleARGBColsUp2_C; -#if defined(HAS_SCALEARGBCOLSUP2_SSE2) - if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8)) { - ScaleARGBFilterCols = ScaleARGBColsUp2_SSE2; - } -#endif -#if defined(HAS_SCALEARGBCOLSUP2_MMI) - if (TestCpuFlag(kCpuHasMMI) && IS_ALIGNED(dst_width, 4)) { - ScaleARGBFilterCols = ScaleARGBColsUp2_MMI; - } -#endif - } - - const int max_y = (src_height - 1) << 16; - if (y > max_y) { - y = max_y; - } - const int kYShift = 1; // Shift Y by 1 to convert Y plane to UV coordinate. - int yi = y >> 16; - int uv_yi = yi >> kYShift; - const uint8_t* src_row_y = src_y + yi * src_stride_y; - const uint8_t* src_row_u = src_u + uv_yi * src_stride_u; - const uint8_t* src_row_v = src_v + uv_yi * src_stride_v; - - // Allocate 2 rows of ARGB. - const int kRowSize = (dst_width * 4 + 31) & ~31; - align_buffer_64(row, kRowSize * 2); - - // Allocate 1 row of ARGB for source conversion. - align_buffer_64(argb_row, src_width * 4); - - uint8_t* rowptr = row; - int rowstride = kRowSize; - int lasty = yi; - - // TODO(fbarchard): Convert first 2 rows of YUV to ARGB. - ScaleARGBFilterCols(rowptr, src_row_y, dst_width, x, dx); - if (src_height > 1) { - src_row_y += src_stride_y; - if (yi & 1) { - src_row_u += src_stride_u; - src_row_v += src_stride_v; - } - } - ScaleARGBFilterCols(rowptr + rowstride, src_row_y, dst_width, x, dx); - if (src_height > 2) { - src_row_y += src_stride_y; - if (!(yi & 1)) { - src_row_u += src_stride_u; - src_row_v += src_stride_v; - } - } - - for (j = 0; j < dst_height; ++j) { - yi = y >> 16; - if (yi != lasty) { - if (y > max_y) { - y = max_y; - yi = y >> 16; - uv_yi = yi >> kYShift; - src_row_y = src_y + yi * src_stride_y; - src_row_u = src_u + uv_yi * src_stride_u; - src_row_v = src_v + uv_yi * src_stride_v; - } - if (yi != lasty) { - // TODO(fbarchard): Convert the clipped region of row. 
- I422ToARGBRow(src_row_y, src_row_u, src_row_v, argb_row, src_width); - ScaleARGBFilterCols(rowptr, argb_row, dst_width, x, dx); - rowptr += rowstride; - rowstride = -rowstride; - lasty = yi; - src_row_y += src_stride_y; - if (yi & 1) { - src_row_u += src_stride_u; - src_row_v += src_stride_v; - } - } - } - if (filtering == kFilterLinear) { - InterpolateRow(dst_argb, rowptr, 0, dst_width * 4, 0); - } else { - int yf = (y >> 8) & 255; - InterpolateRow(dst_argb, rowptr, rowstride, dst_width * 4, yf); - } - dst_argb += dst_stride_argb; - y += dy; - } - free_aligned_buffer_64(row); - free_aligned_buffer_64(row_argb); -} -#endif - -// Scale ARGB to/from any dimensions, without interpolation. -// Fixed point math is used for performance: The upper 16 bits -// of x and dx is the integer part of the source position and -// the lower 16 bits are the fixed decimal part. - -static void ScaleARGBSimple(int src_width, - int src_height, - int dst_width, - int dst_height, - int src_stride, - int dst_stride, - const uint8_t* src_argb, - uint8_t* dst_argb, - int x, - int dx, - int y, - int dy) { - int j; - void (*ScaleARGBCols)(uint8_t * dst_argb, const uint8_t* src_argb, - int dst_width, int x, int dx) = - (src_width >= 32768) ? ScaleARGBCols64_C : ScaleARGBCols_C; - (void)src_height; -#if defined(HAS_SCALEARGBCOLS_SSE2) - if (TestCpuFlag(kCpuHasSSE2) && src_width < 32768) { - ScaleARGBCols = ScaleARGBCols_SSE2; - } -#endif -#if defined(HAS_SCALEARGBCOLS_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - ScaleARGBCols = ScaleARGBCols_Any_NEON; - if (IS_ALIGNED(dst_width, 8)) { - ScaleARGBCols = ScaleARGBCols_NEON; - } - } -#endif -#if defined(HAS_SCALEARGBCOLS_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - ScaleARGBCols = ScaleARGBCols_Any_MMI; - if (IS_ALIGNED(dst_width, 1)) { - ScaleARGBCols = ScaleARGBCols_MMI; - } - } -#endif -#if defined(HAS_SCALEARGBCOLS_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - ScaleARGBCols = ScaleARGBCols_Any_MSA; - if (IS_ALIGNED(dst_width, 4)) { - ScaleARGBCols = ScaleARGBCols_MSA; - } - } -#endif - if (src_width * 2 == dst_width && x < 0x8000) { - ScaleARGBCols = ScaleARGBColsUp2_C; -#if defined(HAS_SCALEARGBCOLSUP2_SSE2) - if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8)) { - ScaleARGBCols = ScaleARGBColsUp2_SSE2; - } -#endif -#if defined(HAS_SCALEARGBCOLSUP2_MMI) - if (TestCpuFlag(kCpuHasMMI) && IS_ALIGNED(dst_width, 4)) { - ScaleARGBCols = ScaleARGBColsUp2_MMI; - } -#endif - } - - for (j = 0; j < dst_height; ++j) { - ScaleARGBCols(dst_argb, src_argb + (y >> 16) * src_stride, dst_width, x, - dx); - dst_argb += dst_stride; - y += dy; - } -} - -// ScaleARGB a ARGB. -// This function in turn calls a scaling function -// suitable for handling the desired resolutions. -static void ScaleARGB(const uint8_t* src, - int src_stride, - int src_width, - int src_height, - uint8_t* dst, - int dst_stride, - int dst_width, - int dst_height, - int clip_x, - int clip_y, - int clip_width, - int clip_height, - enum FilterMode filtering) { - // Initial source x/y coordinate and step values as 16.16 fixed point. - int x = 0; - int y = 0; - int dx = 0; - int dy = 0; - // ARGB does not support box filter yet, but allow the user to pass it. - // Simplify filtering when possible. - filtering = ScaleFilterReduce(src_width, src_height, dst_width, dst_height, - filtering); - - // Negative src_height means invert the image. 
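The flip requested by a negative src_height, handled by the lines that follow, amounts to pointing at the last source row and walking upward with a negated stride. A stand-alone sketch with an invented row table:

/* Stand-alone illustration (not libyuv code) of inversion via a negative stride. */
#include <stdio.h>

int main(void) {
  const char* rows[3] = {"row0", "row1", "row2"};
  int src_height = -3;                    /* caller asked for a vertical flip   */
  int stride = 1;                         /* one table entry per row            */
  int first = 0;
  if (src_height < 0) {
    src_height = -src_height;
    first = (src_height - 1) * stride;    /* start at the bottom row            */
    stride = -stride;                     /* and step upward                    */
  }
  for (int j = 0; j < src_height; ++j) {
    printf("%s\n", rows[first + j * stride]);
  }
  return 0;
}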
- if (src_height < 0) { - src_height = -src_height; - src = src + (src_height - 1) * src_stride; - src_stride = -src_stride; - } - ScaleSlope(src_width, src_height, dst_width, dst_height, filtering, &x, &y, - &dx, &dy); - src_width = Abs(src_width); - if (clip_x) { - int64_t clipf = (int64_t)(clip_x)*dx; - x += (clipf & 0xffff); - src += (clipf >> 16) * 4; - dst += clip_x * 4; - } - if (clip_y) { - int64_t clipf = (int64_t)(clip_y)*dy; - y += (clipf & 0xffff); - src += (clipf >> 16) * src_stride; - dst += clip_y * dst_stride; - } - - // Special case for integer step values. - if (((dx | dy) & 0xffff) == 0) { - if (!dx || !dy) { // 1 pixel wide and/or tall. - filtering = kFilterNone; - } else { - // Optimized even scale down. ie 2, 4, 6, 8, 10x. - if (!(dx & 0x10000) && !(dy & 0x10000)) { - if (dx == 0x20000) { - // Optimized 1/2 downsample. - ScaleARGBDown2(src_width, src_height, clip_width, clip_height, - src_stride, dst_stride, src, dst, x, dx, y, dy, - filtering); - return; - } - if (dx == 0x40000 && filtering == kFilterBox) { - // Optimized 1/4 box downsample. - ScaleARGBDown4Box(src_width, src_height, clip_width, clip_height, - src_stride, dst_stride, src, dst, x, dx, y, dy); - return; - } - ScaleARGBDownEven(src_width, src_height, clip_width, clip_height, - src_stride, dst_stride, src, dst, x, dx, y, dy, - filtering); - return; - } - // Optimized odd scale down. ie 3, 5, 7, 9x. - if ((dx & 0x10000) && (dy & 0x10000)) { - filtering = kFilterNone; - if (dx == 0x10000 && dy == 0x10000) { - // Straight copy. - ARGBCopy(src + (y >> 16) * src_stride + (x >> 16) * 4, src_stride, - dst, dst_stride, clip_width, clip_height); - return; - } - } - } - } - if (dx == 0x10000 && (x & 0xffff) == 0) { - // Arbitrary scale vertically, but unscaled horizontally. - ScalePlaneVertical(src_height, clip_width, clip_height, src_stride, - dst_stride, src, dst, x, y, dy, 4, filtering); - return; - } - if (filtering && dy < 65536) { - ScaleARGBBilinearUp(src_width, src_height, clip_width, clip_height, - src_stride, dst_stride, src, dst, x, dx, y, dy, - filtering); - return; - } - if (filtering) { - ScaleARGBBilinearDown(src_width, src_height, clip_width, clip_height, - src_stride, dst_stride, src, dst, x, dx, y, dy, - filtering); - return; - } - ScaleARGBSimple(src_width, src_height, clip_width, clip_height, src_stride, - dst_stride, src, dst, x, dx, y, dy); -} - -LIBYUV_API -int ARGBScaleClip(const uint8_t* src_argb, - int src_stride_argb, - int src_width, - int src_height, - uint8_t* dst_argb, - int dst_stride_argb, - int dst_width, - int dst_height, - int clip_x, - int clip_y, - int clip_width, - int clip_height, - enum FilterMode filtering) { - if (!src_argb || src_width == 0 || src_height == 0 || !dst_argb || - dst_width <= 0 || dst_height <= 0 || clip_x < 0 || clip_y < 0 || - clip_width > 32768 || clip_height > 32768 || - (clip_x + clip_width) > dst_width || - (clip_y + clip_height) > dst_height) { - return -1; - } - ScaleARGB(src_argb, src_stride_argb, src_width, src_height, dst_argb, - dst_stride_argb, dst_width, dst_height, clip_x, clip_y, clip_width, - clip_height, filtering); - return 0; -} - -// Scale an ARGB image. 
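A minimal usage sketch for the entry point defined just below. It assumes the usual libyuv include layout (libyuv/scale_argb.h declaring ARGBScale and the FilterMode enum) and a linked libyuv; only the call shape and the argument conventions visible in this file are illustrated.

/* Hypothetical caller of ARGBScale; header name and build setup are assumptions. */
#include <stdint.h>
#include <stdlib.h>
#include "libyuv/scale_argb.h"

int main(void) {
  int src_w = 64, src_h = 48, dst_w = 32, dst_h = 24;
  uint8_t* src = (uint8_t*)calloc((size_t)src_w * src_h, 4);  /* ARGB: 4 bytes/pixel */
  uint8_t* dst = (uint8_t*)calloc((size_t)dst_w * dst_h, 4);
  /* Strides are in bytes; a negative src_height would request a vertical flip. */
  int r = ARGBScale(src, src_w * 4, src_w, src_h,
                    dst, dst_w * 4, dst_w, dst_h, kFilterBilinear);
  free(src);
  free(dst);
  return r;                               /* 0 on success, -1 on bad arguments */
}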
-LIBYUV_API -int ARGBScale(const uint8_t* src_argb, - int src_stride_argb, - int src_width, - int src_height, - uint8_t* dst_argb, - int dst_stride_argb, - int dst_width, - int dst_height, - enum FilterMode filtering) { - if (!src_argb || src_width == 0 || src_height == 0 || src_width > 32768 || - src_height > 32768 || !dst_argb || dst_width <= 0 || dst_height <= 0) { - return -1; - } - ScaleARGB(src_argb, src_stride_argb, src_width, src_height, dst_argb, - dst_stride_argb, dst_width, dst_height, 0, 0, dst_width, dst_height, - filtering); - return 0; -} - -// Scale with YUV conversion to ARGB and clipping. -LIBYUV_API -int YUVToARGBScaleClip(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint32_t src_fourcc, - int src_width, - int src_height, - uint8_t* dst_argb, - int dst_stride_argb, - uint32_t dst_fourcc, - int dst_width, - int dst_height, - int clip_x, - int clip_y, - int clip_width, - int clip_height, - enum FilterMode filtering) { - uint8_t* argb_buffer = (uint8_t*)malloc(src_width * src_height * 4); - int r; - (void)src_fourcc; // TODO(fbarchard): implement and/or assert. - (void)dst_fourcc; - I420ToARGB(src_y, src_stride_y, src_u, src_stride_u, src_v, src_stride_v, - argb_buffer, src_width * 4, src_width, src_height); - - r = ARGBScaleClip(argb_buffer, src_width * 4, src_width, src_height, dst_argb, - dst_stride_argb, dst_width, dst_height, clip_x, clip_y, - clip_width, clip_height, filtering); - free(argb_buffer); - return r; -} - -#ifdef __cplusplus -} // extern "C" -} // namespace libyuv -#endif diff --git a/thirdparty/libyuv/source/scale_common.cc b/thirdparty/libyuv/source/scale_common.cc deleted file mode 100644 index da96d42..0000000 --- a/thirdparty/libyuv/source/scale_common.cc +++ /dev/null @@ -1,1769 +0,0 @@ -/* - * Copyright 2013 The LibYuv Project Authors. All rights reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#include "libyuv/scale.h" - -#include -#include - -#include "libyuv/cpu_id.h" -#include "libyuv/planar_functions.h" // For CopyARGB -#include "libyuv/row.h" -#include "libyuv/scale_row.h" - -#ifdef __cplusplus -namespace libyuv { -extern "C" { -#endif - -static __inline int Abs(int v) { - return v >= 0 ? 
v : -v; -} - -// CPU agnostic row functions -void ScaleRowDown2_C(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst, - int dst_width) { - int x; - (void)src_stride; - for (x = 0; x < dst_width - 1; x += 2) { - dst[0] = src_ptr[1]; - dst[1] = src_ptr[3]; - dst += 2; - src_ptr += 4; - } - if (dst_width & 1) { - dst[0] = src_ptr[1]; - } -} - -void ScaleRowDown2_16_C(const uint16_t* src_ptr, - ptrdiff_t src_stride, - uint16_t* dst, - int dst_width) { - int x; - (void)src_stride; - for (x = 0; x < dst_width - 1; x += 2) { - dst[0] = src_ptr[1]; - dst[1] = src_ptr[3]; - dst += 2; - src_ptr += 4; - } - if (dst_width & 1) { - dst[0] = src_ptr[1]; - } -} - -void ScaleRowDown2Linear_C(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst, - int dst_width) { - const uint8_t* s = src_ptr; - int x; - (void)src_stride; - for (x = 0; x < dst_width - 1; x += 2) { - dst[0] = (s[0] + s[1] + 1) >> 1; - dst[1] = (s[2] + s[3] + 1) >> 1; - dst += 2; - s += 4; - } - if (dst_width & 1) { - dst[0] = (s[0] + s[1] + 1) >> 1; - } -} - -void ScaleRowDown2Linear_16_C(const uint16_t* src_ptr, - ptrdiff_t src_stride, - uint16_t* dst, - int dst_width) { - const uint16_t* s = src_ptr; - int x; - (void)src_stride; - for (x = 0; x < dst_width - 1; x += 2) { - dst[0] = (s[0] + s[1] + 1) >> 1; - dst[1] = (s[2] + s[3] + 1) >> 1; - dst += 2; - s += 4; - } - if (dst_width & 1) { - dst[0] = (s[0] + s[1] + 1) >> 1; - } -} - -void ScaleRowDown2Box_C(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst, - int dst_width) { - const uint8_t* s = src_ptr; - const uint8_t* t = src_ptr + src_stride; - int x; - for (x = 0; x < dst_width - 1; x += 2) { - dst[0] = (s[0] + s[1] + t[0] + t[1] + 2) >> 2; - dst[1] = (s[2] + s[3] + t[2] + t[3] + 2) >> 2; - dst += 2; - s += 4; - t += 4; - } - if (dst_width & 1) { - dst[0] = (s[0] + s[1] + t[0] + t[1] + 2) >> 2; - } -} - -void ScaleRowDown2Box_Odd_C(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst, - int dst_width) { - const uint8_t* s = src_ptr; - const uint8_t* t = src_ptr + src_stride; - int x; - dst_width -= 1; - for (x = 0; x < dst_width - 1; x += 2) { - dst[0] = (s[0] + s[1] + t[0] + t[1] + 2) >> 2; - dst[1] = (s[2] + s[3] + t[2] + t[3] + 2) >> 2; - dst += 2; - s += 4; - t += 4; - } - if (dst_width & 1) { - dst[0] = (s[0] + s[1] + t[0] + t[1] + 2) >> 2; - dst += 1; - s += 2; - t += 2; - } - dst[0] = (s[0] + t[0] + 1) >> 1; -} - -void ScaleRowDown2Box_16_C(const uint16_t* src_ptr, - ptrdiff_t src_stride, - uint16_t* dst, - int dst_width) { - const uint16_t* s = src_ptr; - const uint16_t* t = src_ptr + src_stride; - int x; - for (x = 0; x < dst_width - 1; x += 2) { - dst[0] = (s[0] + s[1] + t[0] + t[1] + 2) >> 2; - dst[1] = (s[2] + s[3] + t[2] + t[3] + 2) >> 2; - dst += 2; - s += 4; - t += 4; - } - if (dst_width & 1) { - dst[0] = (s[0] + s[1] + t[0] + t[1] + 2) >> 2; - } -} - -void ScaleRowDown4_C(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst, - int dst_width) { - int x; - (void)src_stride; - for (x = 0; x < dst_width - 1; x += 2) { - dst[0] = src_ptr[2]; - dst[1] = src_ptr[6]; - dst += 2; - src_ptr += 8; - } - if (dst_width & 1) { - dst[0] = src_ptr[2]; - } -} - -void ScaleRowDown4_16_C(const uint16_t* src_ptr, - ptrdiff_t src_stride, - uint16_t* dst, - int dst_width) { - int x; - (void)src_stride; - for (x = 0; x < dst_width - 1; x += 2) { - dst[0] = src_ptr[2]; - dst[1] = src_ptr[6]; - dst += 2; - src_ptr += 8; - } - if (dst_width & 1) { - dst[0] = src_ptr[2]; - } -} - -void ScaleRowDown4Box_C(const uint8_t* src_ptr, - ptrdiff_t 
src_stride, - uint8_t* dst, - int dst_width) { - intptr_t stride = src_stride; - int x; - for (x = 0; x < dst_width - 1; x += 2) { - dst[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] + src_ptr[3] + - src_ptr[stride + 0] + src_ptr[stride + 1] + src_ptr[stride + 2] + - src_ptr[stride + 3] + src_ptr[stride * 2 + 0] + - src_ptr[stride * 2 + 1] + src_ptr[stride * 2 + 2] + - src_ptr[stride * 2 + 3] + src_ptr[stride * 3 + 0] + - src_ptr[stride * 3 + 1] + src_ptr[stride * 3 + 2] + - src_ptr[stride * 3 + 3] + 8) >> - 4; - dst[1] = (src_ptr[4] + src_ptr[5] + src_ptr[6] + src_ptr[7] + - src_ptr[stride + 4] + src_ptr[stride + 5] + src_ptr[stride + 6] + - src_ptr[stride + 7] + src_ptr[stride * 2 + 4] + - src_ptr[stride * 2 + 5] + src_ptr[stride * 2 + 6] + - src_ptr[stride * 2 + 7] + src_ptr[stride * 3 + 4] + - src_ptr[stride * 3 + 5] + src_ptr[stride * 3 + 6] + - src_ptr[stride * 3 + 7] + 8) >> - 4; - dst += 2; - src_ptr += 8; - } - if (dst_width & 1) { - dst[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] + src_ptr[3] + - src_ptr[stride + 0] + src_ptr[stride + 1] + src_ptr[stride + 2] + - src_ptr[stride + 3] + src_ptr[stride * 2 + 0] + - src_ptr[stride * 2 + 1] + src_ptr[stride * 2 + 2] + - src_ptr[stride * 2 + 3] + src_ptr[stride * 3 + 0] + - src_ptr[stride * 3 + 1] + src_ptr[stride * 3 + 2] + - src_ptr[stride * 3 + 3] + 8) >> - 4; - } -} - -void ScaleRowDown4Box_16_C(const uint16_t* src_ptr, - ptrdiff_t src_stride, - uint16_t* dst, - int dst_width) { - intptr_t stride = src_stride; - int x; - for (x = 0; x < dst_width - 1; x += 2) { - dst[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] + src_ptr[3] + - src_ptr[stride + 0] + src_ptr[stride + 1] + src_ptr[stride + 2] + - src_ptr[stride + 3] + src_ptr[stride * 2 + 0] + - src_ptr[stride * 2 + 1] + src_ptr[stride * 2 + 2] + - src_ptr[stride * 2 + 3] + src_ptr[stride * 3 + 0] + - src_ptr[stride * 3 + 1] + src_ptr[stride * 3 + 2] + - src_ptr[stride * 3 + 3] + 8) >> - 4; - dst[1] = (src_ptr[4] + src_ptr[5] + src_ptr[6] + src_ptr[7] + - src_ptr[stride + 4] + src_ptr[stride + 5] + src_ptr[stride + 6] + - src_ptr[stride + 7] + src_ptr[stride * 2 + 4] + - src_ptr[stride * 2 + 5] + src_ptr[stride * 2 + 6] + - src_ptr[stride * 2 + 7] + src_ptr[stride * 3 + 4] + - src_ptr[stride * 3 + 5] + src_ptr[stride * 3 + 6] + - src_ptr[stride * 3 + 7] + 8) >> - 4; - dst += 2; - src_ptr += 8; - } - if (dst_width & 1) { - dst[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] + src_ptr[3] + - src_ptr[stride + 0] + src_ptr[stride + 1] + src_ptr[stride + 2] + - src_ptr[stride + 3] + src_ptr[stride * 2 + 0] + - src_ptr[stride * 2 + 1] + src_ptr[stride * 2 + 2] + - src_ptr[stride * 2 + 3] + src_ptr[stride * 3 + 0] + - src_ptr[stride * 3 + 1] + src_ptr[stride * 3 + 2] + - src_ptr[stride * 3 + 3] + 8) >> - 4; - } -} - -void ScaleRowDown34_C(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst, - int dst_width) { - int x; - (void)src_stride; - assert((dst_width % 3 == 0) && (dst_width > 0)); - for (x = 0; x < dst_width; x += 3) { - dst[0] = src_ptr[0]; - dst[1] = src_ptr[1]; - dst[2] = src_ptr[3]; - dst += 3; - src_ptr += 4; - } -} - -void ScaleRowDown34_16_C(const uint16_t* src_ptr, - ptrdiff_t src_stride, - uint16_t* dst, - int dst_width) { - int x; - (void)src_stride; - assert((dst_width % 3 == 0) && (dst_width > 0)); - for (x = 0; x < dst_width; x += 3) { - dst[0] = src_ptr[0]; - dst[1] = src_ptr[1]; - dst[2] = src_ptr[3]; - dst += 3; - src_ptr += 4; - } -} - -// Filter rows 0 and 1 together, 3 : 1 -void ScaleRowDown34_0_Box_C(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* d, - 
int dst_width) { - const uint8_t* s = src_ptr; - const uint8_t* t = src_ptr + src_stride; - int x; - assert((dst_width % 3 == 0) && (dst_width > 0)); - for (x = 0; x < dst_width; x += 3) { - uint8_t a0 = (s[0] * 3 + s[1] * 1 + 2) >> 2; - uint8_t a1 = (s[1] * 1 + s[2] * 1 + 1) >> 1; - uint8_t a2 = (s[2] * 1 + s[3] * 3 + 2) >> 2; - uint8_t b0 = (t[0] * 3 + t[1] * 1 + 2) >> 2; - uint8_t b1 = (t[1] * 1 + t[2] * 1 + 1) >> 1; - uint8_t b2 = (t[2] * 1 + t[3] * 3 + 2) >> 2; - d[0] = (a0 * 3 + b0 + 2) >> 2; - d[1] = (a1 * 3 + b1 + 2) >> 2; - d[2] = (a2 * 3 + b2 + 2) >> 2; - d += 3; - s += 4; - t += 4; - } -} - -void ScaleRowDown34_0_Box_16_C(const uint16_t* src_ptr, - ptrdiff_t src_stride, - uint16_t* d, - int dst_width) { - const uint16_t* s = src_ptr; - const uint16_t* t = src_ptr + src_stride; - int x; - assert((dst_width % 3 == 0) && (dst_width > 0)); - for (x = 0; x < dst_width; x += 3) { - uint16_t a0 = (s[0] * 3 + s[1] * 1 + 2) >> 2; - uint16_t a1 = (s[1] * 1 + s[2] * 1 + 1) >> 1; - uint16_t a2 = (s[2] * 1 + s[3] * 3 + 2) >> 2; - uint16_t b0 = (t[0] * 3 + t[1] * 1 + 2) >> 2; - uint16_t b1 = (t[1] * 1 + t[2] * 1 + 1) >> 1; - uint16_t b2 = (t[2] * 1 + t[3] * 3 + 2) >> 2; - d[0] = (a0 * 3 + b0 + 2) >> 2; - d[1] = (a1 * 3 + b1 + 2) >> 2; - d[2] = (a2 * 3 + b2 + 2) >> 2; - d += 3; - s += 4; - t += 4; - } -} - -// Filter rows 1 and 2 together, 1 : 1 -void ScaleRowDown34_1_Box_C(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* d, - int dst_width) { - const uint8_t* s = src_ptr; - const uint8_t* t = src_ptr + src_stride; - int x; - assert((dst_width % 3 == 0) && (dst_width > 0)); - for (x = 0; x < dst_width; x += 3) { - uint8_t a0 = (s[0] * 3 + s[1] * 1 + 2) >> 2; - uint8_t a1 = (s[1] * 1 + s[2] * 1 + 1) >> 1; - uint8_t a2 = (s[2] * 1 + s[3] * 3 + 2) >> 2; - uint8_t b0 = (t[0] * 3 + t[1] * 1 + 2) >> 2; - uint8_t b1 = (t[1] * 1 + t[2] * 1 + 1) >> 1; - uint8_t b2 = (t[2] * 1 + t[3] * 3 + 2) >> 2; - d[0] = (a0 + b0 + 1) >> 1; - d[1] = (a1 + b1 + 1) >> 1; - d[2] = (a2 + b2 + 1) >> 1; - d += 3; - s += 4; - t += 4; - } -} - -void ScaleRowDown34_1_Box_16_C(const uint16_t* src_ptr, - ptrdiff_t src_stride, - uint16_t* d, - int dst_width) { - const uint16_t* s = src_ptr; - const uint16_t* t = src_ptr + src_stride; - int x; - assert((dst_width % 3 == 0) && (dst_width > 0)); - for (x = 0; x < dst_width; x += 3) { - uint16_t a0 = (s[0] * 3 + s[1] * 1 + 2) >> 2; - uint16_t a1 = (s[1] * 1 + s[2] * 1 + 1) >> 1; - uint16_t a2 = (s[2] * 1 + s[3] * 3 + 2) >> 2; - uint16_t b0 = (t[0] * 3 + t[1] * 1 + 2) >> 2; - uint16_t b1 = (t[1] * 1 + t[2] * 1 + 1) >> 1; - uint16_t b2 = (t[2] * 1 + t[3] * 3 + 2) >> 2; - d[0] = (a0 + b0 + 1) >> 1; - d[1] = (a1 + b1 + 1) >> 1; - d[2] = (a2 + b2 + 1) >> 1; - d += 3; - s += 4; - t += 4; - } -} - -// Sample position: (O is src sample position, X is dst sample position) -// -// v dst_ptr at here v stop at here -// X O X X O X X O X X O X X O X -// ^ src_ptr at here -void ScaleRowUp2_Linear_C(const uint8_t* src_ptr, - uint8_t* dst_ptr, - int dst_width) { - int src_width = dst_width >> 1; - int x; - assert((dst_width % 2 == 0) && (dst_width >= 0)); - for (x = 0; x < src_width; ++x) { - dst_ptr[2 * x + 0] = (src_ptr[x + 0] * 3 + src_ptr[x + 1] * 1 + 2) >> 2; - dst_ptr[2 * x + 1] = (src_ptr[x + 0] * 1 + src_ptr[x + 1] * 3 + 2) >> 2; - } -} - -// Sample position: (O is src sample position, X is dst sample position) -// -// src_ptr at here -// X v X X X X X X X X X -// O O O O O -// X X X X X X X X X X -// ^ dst_ptr at here ^ stop at here -// X X X X X X X X X X -// O O O O O -// X X X 
X X X X X X X -void ScaleRowUp2_Bilinear_C(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - ptrdiff_t dst_stride, - int dst_width) { - const uint8_t* s = src_ptr; - const uint8_t* t = src_ptr + src_stride; - uint8_t* d = dst_ptr; - uint8_t* e = dst_ptr + dst_stride; - int src_width = dst_width >> 1; - int x; - assert((dst_width % 2 == 0) && (dst_width >= 0)); - for (x = 0; x < src_width; ++x) { - d[2 * x + 0] = - (s[x + 0] * 9 + s[x + 1] * 3 + t[x + 0] * 3 + t[x + 1] * 1 + 8) >> 4; - d[2 * x + 1] = - (s[x + 0] * 3 + s[x + 1] * 9 + t[x + 0] * 1 + t[x + 1] * 3 + 8) >> 4; - e[2 * x + 0] = - (s[x + 0] * 3 + s[x + 1] * 1 + t[x + 0] * 9 + t[x + 1] * 3 + 8) >> 4; - e[2 * x + 1] = - (s[x + 0] * 1 + s[x + 1] * 3 + t[x + 0] * 3 + t[x + 1] * 9 + 8) >> 4; - } -} - -// Only suitable for at most 14 bit range. -void ScaleRowUp2_Linear_16_C(const uint16_t* src_ptr, - uint16_t* dst_ptr, - int dst_width) { - int src_width = dst_width >> 1; - int x; - assert((dst_width % 2 == 0) && (dst_width >= 0)); - for (x = 0; x < src_width; ++x) { - dst_ptr[2 * x + 0] = (src_ptr[x + 0] * 3 + src_ptr[x + 1] * 1 + 2) >> 2; - dst_ptr[2 * x + 1] = (src_ptr[x + 0] * 1 + src_ptr[x + 1] * 3 + 2) >> 2; - } -} - -// Only suitable for at most 12bit range. -void ScaleRowUp2_Bilinear_16_C(const uint16_t* src_ptr, - ptrdiff_t src_stride, - uint16_t* dst_ptr, - ptrdiff_t dst_stride, - int dst_width) { - const uint16_t* s = src_ptr; - const uint16_t* t = src_ptr + src_stride; - uint16_t* d = dst_ptr; - uint16_t* e = dst_ptr + dst_stride; - int src_width = dst_width >> 1; - int x; - assert((dst_width % 2 == 0) && (dst_width >= 0)); - for (x = 0; x < src_width; ++x) { - d[2 * x + 0] = - (s[x + 0] * 9 + s[x + 1] * 3 + t[x + 0] * 3 + t[x + 1] * 1 + 8) >> 4; - d[2 * x + 1] = - (s[x + 0] * 3 + s[x + 1] * 9 + t[x + 0] * 1 + t[x + 1] * 3 + 8) >> 4; - e[2 * x + 0] = - (s[x + 0] * 3 + s[x + 1] * 1 + t[x + 0] * 9 + t[x + 1] * 3 + 8) >> 4; - e[2 * x + 1] = - (s[x + 0] * 1 + s[x + 1] * 3 + t[x + 0] * 3 + t[x + 1] * 9 + 8) >> 4; - } -} - -// Scales a single row of pixels using point sampling. -void ScaleCols_C(uint8_t* dst_ptr, - const uint8_t* src_ptr, - int dst_width, - int x, - int dx) { - int j; - for (j = 0; j < dst_width - 1; j += 2) { - dst_ptr[0] = src_ptr[x >> 16]; - x += dx; - dst_ptr[1] = src_ptr[x >> 16]; - x += dx; - dst_ptr += 2; - } - if (dst_width & 1) { - dst_ptr[0] = src_ptr[x >> 16]; - } -} - -void ScaleCols_16_C(uint16_t* dst_ptr, - const uint16_t* src_ptr, - int dst_width, - int x, - int dx) { - int j; - for (j = 0; j < dst_width - 1; j += 2) { - dst_ptr[0] = src_ptr[x >> 16]; - x += dx; - dst_ptr[1] = src_ptr[x >> 16]; - x += dx; - dst_ptr += 2; - } - if (dst_width & 1) { - dst_ptr[0] = src_ptr[x >> 16]; - } -} - -// Scales a single row of pixels up by 2x using point sampling. 
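The range limits noted above ("at most 14 bit" for the 16-bit linear kernel, "at most 12bit" for the 16-bit bilinear kernel) follow from the filter weights: the weighted sums have to fit in 16 bits, which is presumably what the SIMD counterparts of these C routines keep their intermediates in. A quick worst-case check:

/* Stand-alone illustration (not libyuv code) of the 14-bit / 12-bit limits. */
#include <stdio.h>

int main(void) {
  unsigned max14 = (1u << 14) - 1;        /* 16383: largest 14-bit sample            */
  unsigned max12 = (1u << 12) - 1;        /* 4095: largest 12-bit sample             */
  unsigned linear_worst = (3 + 1) * max14 + 2;            /* weights 3:1, +2 rounding */
  unsigned bilinear_worst = (9 + 3 + 3 + 1) * max12 + 8;  /* weights 9:3:3:1, +8      */
  printf("linear worst case   = %u (fits in 16 bits: %s)\n", linear_worst,
         linear_worst <= 65535u ? "yes" : "no");
  printf("bilinear worst case = %u (fits in 16 bits: %s)\n", bilinear_worst,
         bilinear_worst <= 65535u ? "yes" : "no");
  return 0;
}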
-void ScaleColsUp2_C(uint8_t* dst_ptr, - const uint8_t* src_ptr, - int dst_width, - int x, - int dx) { - int j; - (void)x; - (void)dx; - for (j = 0; j < dst_width - 1; j += 2) { - dst_ptr[1] = dst_ptr[0] = src_ptr[0]; - src_ptr += 1; - dst_ptr += 2; - } - if (dst_width & 1) { - dst_ptr[0] = src_ptr[0]; - } -} - -void ScaleColsUp2_16_C(uint16_t* dst_ptr, - const uint16_t* src_ptr, - int dst_width, - int x, - int dx) { - int j; - (void)x; - (void)dx; - for (j = 0; j < dst_width - 1; j += 2) { - dst_ptr[1] = dst_ptr[0] = src_ptr[0]; - src_ptr += 1; - dst_ptr += 2; - } - if (dst_width & 1) { - dst_ptr[0] = src_ptr[0]; - } -} - -// (1-f)a + fb can be replaced with a + f(b-a) -#if defined(__arm__) || defined(__aarch64__) -#define BLENDER(a, b, f) \ - (uint8_t)((int)(a) + ((((int)((f)) * ((int)(b) - (int)(a))) + 0x8000) >> 16)) -#else -// Intel uses 7 bit math with rounding. -#define BLENDER(a, b, f) \ - (uint8_t)((int)(a) + (((int)((f) >> 9) * ((int)(b) - (int)(a)) + 0x40) >> 7)) -#endif - -void ScaleFilterCols_C(uint8_t* dst_ptr, - const uint8_t* src_ptr, - int dst_width, - int x, - int dx) { - int j; - for (j = 0; j < dst_width - 1; j += 2) { - int xi = x >> 16; - int a = src_ptr[xi]; - int b = src_ptr[xi + 1]; - dst_ptr[0] = BLENDER(a, b, x & 0xffff); - x += dx; - xi = x >> 16; - a = src_ptr[xi]; - b = src_ptr[xi + 1]; - dst_ptr[1] = BLENDER(a, b, x & 0xffff); - x += dx; - dst_ptr += 2; - } - if (dst_width & 1) { - int xi = x >> 16; - int a = src_ptr[xi]; - int b = src_ptr[xi + 1]; - dst_ptr[0] = BLENDER(a, b, x & 0xffff); - } -} - -void ScaleFilterCols64_C(uint8_t* dst_ptr, - const uint8_t* src_ptr, - int dst_width, - int x32, - int dx) { - int64_t x = (int64_t)(x32); - int j; - for (j = 0; j < dst_width - 1; j += 2) { - int64_t xi = x >> 16; - int a = src_ptr[xi]; - int b = src_ptr[xi + 1]; - dst_ptr[0] = BLENDER(a, b, x & 0xffff); - x += dx; - xi = x >> 16; - a = src_ptr[xi]; - b = src_ptr[xi + 1]; - dst_ptr[1] = BLENDER(a, b, x & 0xffff); - x += dx; - dst_ptr += 2; - } - if (dst_width & 1) { - int64_t xi = x >> 16; - int a = src_ptr[xi]; - int b = src_ptr[xi + 1]; - dst_ptr[0] = BLENDER(a, b, x & 0xffff); - } -} -#undef BLENDER - -// Same as 8 bit arm blender but return is cast to uint16_t -#define BLENDER(a, b, f) \ - (uint16_t)( \ - (int)(a) + \ - (int)((((int64_t)((f)) * ((int64_t)(b) - (int)(a))) + 0x8000) >> 16)) - -void ScaleFilterCols_16_C(uint16_t* dst_ptr, - const uint16_t* src_ptr, - int dst_width, - int x, - int dx) { - int j; - for (j = 0; j < dst_width - 1; j += 2) { - int xi = x >> 16; - int a = src_ptr[xi]; - int b = src_ptr[xi + 1]; - dst_ptr[0] = BLENDER(a, b, x & 0xffff); - x += dx; - xi = x >> 16; - a = src_ptr[xi]; - b = src_ptr[xi + 1]; - dst_ptr[1] = BLENDER(a, b, x & 0xffff); - x += dx; - dst_ptr += 2; - } - if (dst_width & 1) { - int xi = x >> 16; - int a = src_ptr[xi]; - int b = src_ptr[xi + 1]; - dst_ptr[0] = BLENDER(a, b, x & 0xffff); - } -} - -void ScaleFilterCols64_16_C(uint16_t* dst_ptr, - const uint16_t* src_ptr, - int dst_width, - int x32, - int dx) { - int64_t x = (int64_t)(x32); - int j; - for (j = 0; j < dst_width - 1; j += 2) { - int64_t xi = x >> 16; - int a = src_ptr[xi]; - int b = src_ptr[xi + 1]; - dst_ptr[0] = BLENDER(a, b, x & 0xffff); - x += dx; - xi = x >> 16; - a = src_ptr[xi]; - b = src_ptr[xi + 1]; - dst_ptr[1] = BLENDER(a, b, x & 0xffff); - x += dx; - dst_ptr += 2; - } - if (dst_width & 1) { - int64_t xi = x >> 16; - int a = src_ptr[xi]; - int b = src_ptr[xi + 1]; - dst_ptr[0] = BLENDER(a, b, x & 0xffff); - } -} -#undef BLENDER - -void 
ScaleRowDown38_C(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst, - int dst_width) { - int x; - (void)src_stride; - assert(dst_width % 3 == 0); - for (x = 0; x < dst_width; x += 3) { - dst[0] = src_ptr[0]; - dst[1] = src_ptr[3]; - dst[2] = src_ptr[6]; - dst += 3; - src_ptr += 8; - } -} - -void ScaleRowDown38_16_C(const uint16_t* src_ptr, - ptrdiff_t src_stride, - uint16_t* dst, - int dst_width) { - int x; - (void)src_stride; - assert(dst_width % 3 == 0); - for (x = 0; x < dst_width; x += 3) { - dst[0] = src_ptr[0]; - dst[1] = src_ptr[3]; - dst[2] = src_ptr[6]; - dst += 3; - src_ptr += 8; - } -} - -// 8x3 -> 3x1 -void ScaleRowDown38_3_Box_C(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width) { - intptr_t stride = src_stride; - int i; - assert((dst_width % 3 == 0) && (dst_width > 0)); - for (i = 0; i < dst_width; i += 3) { - dst_ptr[0] = - (src_ptr[0] + src_ptr[1] + src_ptr[2] + src_ptr[stride + 0] + - src_ptr[stride + 1] + src_ptr[stride + 2] + src_ptr[stride * 2 + 0] + - src_ptr[stride * 2 + 1] + src_ptr[stride * 2 + 2]) * - (65536 / 9) >> - 16; - dst_ptr[1] = - (src_ptr[3] + src_ptr[4] + src_ptr[5] + src_ptr[stride + 3] + - src_ptr[stride + 4] + src_ptr[stride + 5] + src_ptr[stride * 2 + 3] + - src_ptr[stride * 2 + 4] + src_ptr[stride * 2 + 5]) * - (65536 / 9) >> - 16; - dst_ptr[2] = - (src_ptr[6] + src_ptr[7] + src_ptr[stride + 6] + src_ptr[stride + 7] + - src_ptr[stride * 2 + 6] + src_ptr[stride * 2 + 7]) * - (65536 / 6) >> - 16; - src_ptr += 8; - dst_ptr += 3; - } -} - -void ScaleRowDown38_3_Box_16_C(const uint16_t* src_ptr, - ptrdiff_t src_stride, - uint16_t* dst_ptr, - int dst_width) { - intptr_t stride = src_stride; - int i; - assert((dst_width % 3 == 0) && (dst_width > 0)); - for (i = 0; i < dst_width; i += 3) { - dst_ptr[0] = - (src_ptr[0] + src_ptr[1] + src_ptr[2] + src_ptr[stride + 0] + - src_ptr[stride + 1] + src_ptr[stride + 2] + src_ptr[stride * 2 + 0] + - src_ptr[stride * 2 + 1] + src_ptr[stride * 2 + 2]) * - (65536 / 9) >> - 16; - dst_ptr[1] = - (src_ptr[3] + src_ptr[4] + src_ptr[5] + src_ptr[stride + 3] + - src_ptr[stride + 4] + src_ptr[stride + 5] + src_ptr[stride * 2 + 3] + - src_ptr[stride * 2 + 4] + src_ptr[stride * 2 + 5]) * - (65536 / 9) >> - 16; - dst_ptr[2] = - (src_ptr[6] + src_ptr[7] + src_ptr[stride + 6] + src_ptr[stride + 7] + - src_ptr[stride * 2 + 6] + src_ptr[stride * 2 + 7]) * - (65536 / 6) >> - 16; - src_ptr += 8; - dst_ptr += 3; - } -} - -// 8x2 -> 3x1 -void ScaleRowDown38_2_Box_C(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width) { - intptr_t stride = src_stride; - int i; - assert((dst_width % 3 == 0) && (dst_width > 0)); - for (i = 0; i < dst_width; i += 3) { - dst_ptr[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] + src_ptr[stride + 0] + - src_ptr[stride + 1] + src_ptr[stride + 2]) * - (65536 / 6) >> - 16; - dst_ptr[1] = (src_ptr[3] + src_ptr[4] + src_ptr[5] + src_ptr[stride + 3] + - src_ptr[stride + 4] + src_ptr[stride + 5]) * - (65536 / 6) >> - 16; - dst_ptr[2] = - (src_ptr[6] + src_ptr[7] + src_ptr[stride + 6] + src_ptr[stride + 7]) * - (65536 / 4) >> - 16; - src_ptr += 8; - dst_ptr += 3; - } -} - -void ScaleRowDown38_2_Box_16_C(const uint16_t* src_ptr, - ptrdiff_t src_stride, - uint16_t* dst_ptr, - int dst_width) { - intptr_t stride = src_stride; - int i; - assert((dst_width % 3 == 0) && (dst_width > 0)); - for (i = 0; i < dst_width; i += 3) { - dst_ptr[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] + src_ptr[stride + 0] + - src_ptr[stride + 1] + src_ptr[stride + 2]) * - 
(65536 / 6) >> - 16; - dst_ptr[1] = (src_ptr[3] + src_ptr[4] + src_ptr[5] + src_ptr[stride + 3] + - src_ptr[stride + 4] + src_ptr[stride + 5]) * - (65536 / 6) >> - 16; - dst_ptr[2] = - (src_ptr[6] + src_ptr[7] + src_ptr[stride + 6] + src_ptr[stride + 7]) * - (65536 / 4) >> - 16; - src_ptr += 8; - dst_ptr += 3; - } -} - -void ScaleAddRow_C(const uint8_t* src_ptr, uint16_t* dst_ptr, int src_width) { - int x; - assert(src_width > 0); - for (x = 0; x < src_width - 1; x += 2) { - dst_ptr[0] += src_ptr[0]; - dst_ptr[1] += src_ptr[1]; - src_ptr += 2; - dst_ptr += 2; - } - if (src_width & 1) { - dst_ptr[0] += src_ptr[0]; - } -} - -void ScaleAddRow_16_C(const uint16_t* src_ptr, - uint32_t* dst_ptr, - int src_width) { - int x; - assert(src_width > 0); - for (x = 0; x < src_width - 1; x += 2) { - dst_ptr[0] += src_ptr[0]; - dst_ptr[1] += src_ptr[1]; - src_ptr += 2; - dst_ptr += 2; - } - if (src_width & 1) { - dst_ptr[0] += src_ptr[0]; - } -} - -// ARGB scale row functions - -void ScaleARGBRowDown2_C(const uint8_t* src_argb, - ptrdiff_t src_stride, - uint8_t* dst_argb, - int dst_width) { - const uint32_t* src = (const uint32_t*)(src_argb); - uint32_t* dst = (uint32_t*)(dst_argb); - int x; - (void)src_stride; - for (x = 0; x < dst_width - 1; x += 2) { - dst[0] = src[1]; - dst[1] = src[3]; - src += 4; - dst += 2; - } - if (dst_width & 1) { - dst[0] = src[1]; - } -} - -void ScaleARGBRowDown2Linear_C(const uint8_t* src_argb, - ptrdiff_t src_stride, - uint8_t* dst_argb, - int dst_width) { - int x; - (void)src_stride; - for (x = 0; x < dst_width; ++x) { - dst_argb[0] = (src_argb[0] + src_argb[4] + 1) >> 1; - dst_argb[1] = (src_argb[1] + src_argb[5] + 1) >> 1; - dst_argb[2] = (src_argb[2] + src_argb[6] + 1) >> 1; - dst_argb[3] = (src_argb[3] + src_argb[7] + 1) >> 1; - src_argb += 8; - dst_argb += 4; - } -} - -void ScaleARGBRowDown2Box_C(const uint8_t* src_argb, - ptrdiff_t src_stride, - uint8_t* dst_argb, - int dst_width) { - int x; - for (x = 0; x < dst_width; ++x) { - dst_argb[0] = (src_argb[0] + src_argb[4] + src_argb[src_stride] + - src_argb[src_stride + 4] + 2) >> - 2; - dst_argb[1] = (src_argb[1] + src_argb[5] + src_argb[src_stride + 1] + - src_argb[src_stride + 5] + 2) >> - 2; - dst_argb[2] = (src_argb[2] + src_argb[6] + src_argb[src_stride + 2] + - src_argb[src_stride + 6] + 2) >> - 2; - dst_argb[3] = (src_argb[3] + src_argb[7] + src_argb[src_stride + 3] + - src_argb[src_stride + 7] + 2) >> - 2; - src_argb += 8; - dst_argb += 4; - } -} - -void ScaleARGBRowDownEven_C(const uint8_t* src_argb, - ptrdiff_t src_stride, - int src_stepx, - uint8_t* dst_argb, - int dst_width) { - const uint32_t* src = (const uint32_t*)(src_argb); - uint32_t* dst = (uint32_t*)(dst_argb); - (void)src_stride; - int x; - for (x = 0; x < dst_width - 1; x += 2) { - dst[0] = src[0]; - dst[1] = src[src_stepx]; - src += src_stepx * 2; - dst += 2; - } - if (dst_width & 1) { - dst[0] = src[0]; - } -} - -void ScaleARGBRowDownEvenBox_C(const uint8_t* src_argb, - ptrdiff_t src_stride, - int src_stepx, - uint8_t* dst_argb, - int dst_width) { - int x; - for (x = 0; x < dst_width; ++x) { - dst_argb[0] = (src_argb[0] + src_argb[4] + src_argb[src_stride] + - src_argb[src_stride + 4] + 2) >> - 2; - dst_argb[1] = (src_argb[1] + src_argb[5] + src_argb[src_stride + 1] + - src_argb[src_stride + 5] + 2) >> - 2; - dst_argb[2] = (src_argb[2] + src_argb[6] + src_argb[src_stride + 2] + - src_argb[src_stride + 6] + 2) >> - 2; - dst_argb[3] = (src_argb[3] + src_argb[7] + src_argb[src_stride + 3] + - src_argb[src_stride + 7] + 2) >> - 2; - src_argb += 
src_stepx * 4; - dst_argb += 4; - } -} - -// Scales a single row of pixels using point sampling. -void ScaleARGBCols_C(uint8_t* dst_argb, - const uint8_t* src_argb, - int dst_width, - int x, - int dx) { - const uint32_t* src = (const uint32_t*)(src_argb); - uint32_t* dst = (uint32_t*)(dst_argb); - int j; - for (j = 0; j < dst_width - 1; j += 2) { - dst[0] = src[x >> 16]; - x += dx; - dst[1] = src[x >> 16]; - x += dx; - dst += 2; - } - if (dst_width & 1) { - dst[0] = src[x >> 16]; - } -} - -void ScaleARGBCols64_C(uint8_t* dst_argb, - const uint8_t* src_argb, - int dst_width, - int x32, - int dx) { - int64_t x = (int64_t)(x32); - const uint32_t* src = (const uint32_t*)(src_argb); - uint32_t* dst = (uint32_t*)(dst_argb); - int j; - for (j = 0; j < dst_width - 1; j += 2) { - dst[0] = src[x >> 16]; - x += dx; - dst[1] = src[x >> 16]; - x += dx; - dst += 2; - } - if (dst_width & 1) { - dst[0] = src[x >> 16]; - } -} - -// Scales a single row of pixels up by 2x using point sampling. -void ScaleARGBColsUp2_C(uint8_t* dst_argb, - const uint8_t* src_argb, - int dst_width, - int x, - int dx) { - const uint32_t* src = (const uint32_t*)(src_argb); - uint32_t* dst = (uint32_t*)(dst_argb); - int j; - (void)x; - (void)dx; - for (j = 0; j < dst_width - 1; j += 2) { - dst[1] = dst[0] = src[0]; - src += 1; - dst += 2; - } - if (dst_width & 1) { - dst[0] = src[0]; - } -} - -// TODO(fbarchard): Replace 0x7f ^ f with 128-f. bug=607. -// Mimics SSSE3 blender -#define BLENDER1(a, b, f) ((a) * (0x7f ^ f) + (b)*f) >> 7 -#define BLENDERC(a, b, f, s) \ - (uint32_t)(BLENDER1(((a) >> s) & 255, ((b) >> s) & 255, f) << s) -#define BLENDER(a, b, f) \ - BLENDERC(a, b, f, 24) | BLENDERC(a, b, f, 16) | BLENDERC(a, b, f, 8) | \ - BLENDERC(a, b, f, 0) - -void ScaleARGBFilterCols_C(uint8_t* dst_argb, - const uint8_t* src_argb, - int dst_width, - int x, - int dx) { - const uint32_t* src = (const uint32_t*)(src_argb); - uint32_t* dst = (uint32_t*)(dst_argb); - int j; - for (j = 0; j < dst_width - 1; j += 2) { - int xi = x >> 16; - int xf = (x >> 9) & 0x7f; - uint32_t a = src[xi]; - uint32_t b = src[xi + 1]; - dst[0] = BLENDER(a, b, xf); - x += dx; - xi = x >> 16; - xf = (x >> 9) & 0x7f; - a = src[xi]; - b = src[xi + 1]; - dst[1] = BLENDER(a, b, xf); - x += dx; - dst += 2; - } - if (dst_width & 1) { - int xi = x >> 16; - int xf = (x >> 9) & 0x7f; - uint32_t a = src[xi]; - uint32_t b = src[xi + 1]; - dst[0] = BLENDER(a, b, xf); - } -} - -void ScaleARGBFilterCols64_C(uint8_t* dst_argb, - const uint8_t* src_argb, - int dst_width, - int x32, - int dx) { - int64_t x = (int64_t)(x32); - const uint32_t* src = (const uint32_t*)(src_argb); - uint32_t* dst = (uint32_t*)(dst_argb); - int j; - for (j = 0; j < dst_width - 1; j += 2) { - int64_t xi = x >> 16; - int xf = (x >> 9) & 0x7f; - uint32_t a = src[xi]; - uint32_t b = src[xi + 1]; - dst[0] = BLENDER(a, b, xf); - x += dx; - xi = x >> 16; - xf = (x >> 9) & 0x7f; - a = src[xi]; - b = src[xi + 1]; - dst[1] = BLENDER(a, b, xf); - x += dx; - dst += 2; - } - if (dst_width & 1) { - int64_t xi = x >> 16; - int xf = (x >> 9) & 0x7f; - uint32_t a = src[xi]; - uint32_t b = src[xi + 1]; - dst[0] = BLENDER(a, b, xf); - } -} -#undef BLENDER1 -#undef BLENDERC -#undef BLENDER - -// UV scale row functions -// same as ARGB but 2 channels - -void ScaleUVRowDown2_C(const uint8_t* src_uv, - ptrdiff_t src_stride, - uint8_t* dst_uv, - int dst_width) { - const uint16_t* src = (const uint16_t*)(src_uv); - uint16_t* dst = (uint16_t*)(dst_uv); - int x; - (void)src_stride; - for (x = 0; x < dst_width - 1; x += 2) { - 
dst[0] = src[1]; - dst[1] = src[3]; - src += 2; - dst += 2; - } - if (dst_width & 1) { - dst[0] = src[1]; - } -} - -void ScaleUVRowDown2Linear_C(const uint8_t* src_uv, - ptrdiff_t src_stride, - uint8_t* dst_uv, - int dst_width) { - int x; - (void)src_stride; - for (x = 0; x < dst_width; ++x) { - dst_uv[0] = (src_uv[0] + src_uv[2] + 1) >> 1; - dst_uv[1] = (src_uv[1] + src_uv[3] + 1) >> 1; - src_uv += 4; - dst_uv += 2; - } -} - -void ScaleUVRowDown2Box_C(const uint8_t* src_uv, - ptrdiff_t src_stride, - uint8_t* dst_uv, - int dst_width) { - int x; - for (x = 0; x < dst_width; ++x) { - dst_uv[0] = (src_uv[0] + src_uv[2] + src_uv[src_stride] + - src_uv[src_stride + 2] + 2) >> - 2; - dst_uv[1] = (src_uv[1] + src_uv[3] + src_uv[src_stride + 1] + - src_uv[src_stride + 3] + 2) >> - 2; - src_uv += 4; - dst_uv += 2; - } -} - -void ScaleUVRowDownEven_C(const uint8_t* src_uv, - ptrdiff_t src_stride, - int src_stepx, - uint8_t* dst_uv, - int dst_width) { - const uint16_t* src = (const uint16_t*)(src_uv); - uint16_t* dst = (uint16_t*)(dst_uv); - (void)src_stride; - int x; - for (x = 0; x < dst_width - 1; x += 2) { - dst[0] = src[0]; - dst[1] = src[src_stepx]; - src += src_stepx * 2; - dst += 2; - } - if (dst_width & 1) { - dst[0] = src[0]; - } -} - -void ScaleUVRowDownEvenBox_C(const uint8_t* src_uv, - ptrdiff_t src_stride, - int src_stepx, - uint8_t* dst_uv, - int dst_width) { - int x; - for (x = 0; x < dst_width; ++x) { - dst_uv[0] = (src_uv[0] + src_uv[2] + src_uv[src_stride] + - src_uv[src_stride + 2] + 2) >> - 2; - dst_uv[1] = (src_uv[1] + src_uv[3] + src_uv[src_stride + 1] + - src_uv[src_stride + 3] + 2) >> - 2; - src_uv += src_stepx * 2; - dst_uv += 2; - } -} - -void ScaleUVRowUp2_Linear_C(const uint8_t* src_ptr, - uint8_t* dst_ptr, - int dst_width) { - int src_width = dst_width >> 1; - int x; - assert((dst_width % 2 == 0) && (dst_width >= 0)); - for (x = 0; x < src_width; ++x) { - dst_ptr[4 * x + 0] = - (src_ptr[2 * x + 0] * 3 + src_ptr[2 * x + 2] * 1 + 2) >> 2; - dst_ptr[4 * x + 1] = - (src_ptr[2 * x + 1] * 3 + src_ptr[2 * x + 3] * 1 + 2) >> 2; - dst_ptr[4 * x + 2] = - (src_ptr[2 * x + 0] * 1 + src_ptr[2 * x + 2] * 3 + 2) >> 2; - dst_ptr[4 * x + 3] = - (src_ptr[2 * x + 1] * 1 + src_ptr[2 * x + 3] * 3 + 2) >> 2; - } -} - -void ScaleUVRowUp2_Bilinear_C(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - ptrdiff_t dst_stride, - int dst_width) { - const uint8_t* s = src_ptr; - const uint8_t* t = src_ptr + src_stride; - uint8_t* d = dst_ptr; - uint8_t* e = dst_ptr + dst_stride; - int src_width = dst_width >> 1; - int x; - assert((dst_width % 2 == 0) && (dst_width >= 0)); - for (x = 0; x < src_width; ++x) { - d[4 * x + 0] = (s[2 * x + 0] * 9 + s[2 * x + 2] * 3 + t[2 * x + 0] * 3 + - t[2 * x + 2] * 1 + 8) >> - 4; - d[4 * x + 1] = (s[2 * x + 1] * 9 + s[2 * x + 3] * 3 + t[2 * x + 1] * 3 + - t[2 * x + 3] * 1 + 8) >> - 4; - d[4 * x + 2] = (s[2 * x + 0] * 3 + s[2 * x + 2] * 9 + t[2 * x + 0] * 1 + - t[2 * x + 2] * 3 + 8) >> - 4; - d[4 * x + 3] = (s[2 * x + 1] * 3 + s[2 * x + 3] * 9 + t[2 * x + 1] * 1 + - t[2 * x + 3] * 3 + 8) >> - 4; - e[4 * x + 0] = (s[2 * x + 0] * 3 + s[2 * x + 2] * 1 + t[2 * x + 0] * 9 + - t[2 * x + 2] * 3 + 8) >> - 4; - e[4 * x + 1] = (s[2 * x + 1] * 3 + s[2 * x + 3] * 1 + t[2 * x + 1] * 9 + - t[2 * x + 3] * 3 + 8) >> - 4; - e[4 * x + 2] = (s[2 * x + 0] * 1 + s[2 * x + 2] * 3 + t[2 * x + 0] * 3 + - t[2 * x + 2] * 9 + 8) >> - 4; - e[4 * x + 3] = (s[2 * x + 1] * 1 + s[2 * x + 3] * 3 + t[2 * x + 1] * 3 + - t[2 * x + 3] * 9 + 8) >> - 4; - } -} - -void 
ScaleUVRowUp2_Linear_16_C(const uint16_t* src_ptr, - uint16_t* dst_ptr, - int dst_width) { - int src_width = dst_width >> 1; - int x; - assert((dst_width % 2 == 0) && (dst_width >= 0)); - for (x = 0; x < src_width; ++x) { - dst_ptr[4 * x + 0] = - (src_ptr[2 * x + 0] * 3 + src_ptr[2 * x + 2] * 1 + 2) >> 2; - dst_ptr[4 * x + 1] = - (src_ptr[2 * x + 1] * 3 + src_ptr[2 * x + 3] * 1 + 2) >> 2; - dst_ptr[4 * x + 2] = - (src_ptr[2 * x + 0] * 1 + src_ptr[2 * x + 2] * 3 + 2) >> 2; - dst_ptr[4 * x + 3] = - (src_ptr[2 * x + 1] * 1 + src_ptr[2 * x + 3] * 3 + 2) >> 2; - } -} - -void ScaleUVRowUp2_Bilinear_16_C(const uint16_t* src_ptr, - ptrdiff_t src_stride, - uint16_t* dst_ptr, - ptrdiff_t dst_stride, - int dst_width) { - const uint16_t* s = src_ptr; - const uint16_t* t = src_ptr + src_stride; - uint16_t* d = dst_ptr; - uint16_t* e = dst_ptr + dst_stride; - int src_width = dst_width >> 1; - int x; - assert((dst_width % 2 == 0) && (dst_width >= 0)); - for (x = 0; x < src_width; ++x) { - d[4 * x + 0] = (s[2 * x + 0] * 9 + s[2 * x + 2] * 3 + t[2 * x + 0] * 3 + - t[2 * x + 2] * 1 + 8) >> - 4; - d[4 * x + 1] = (s[2 * x + 1] * 9 + s[2 * x + 3] * 3 + t[2 * x + 1] * 3 + - t[2 * x + 3] * 1 + 8) >> - 4; - d[4 * x + 2] = (s[2 * x + 0] * 3 + s[2 * x + 2] * 9 + t[2 * x + 0] * 1 + - t[2 * x + 2] * 3 + 8) >> - 4; - d[4 * x + 3] = (s[2 * x + 1] * 3 + s[2 * x + 3] * 9 + t[2 * x + 1] * 1 + - t[2 * x + 3] * 3 + 8) >> - 4; - e[4 * x + 0] = (s[2 * x + 0] * 3 + s[2 * x + 2] * 1 + t[2 * x + 0] * 9 + - t[2 * x + 2] * 3 + 8) >> - 4; - e[4 * x + 1] = (s[2 * x + 1] * 3 + s[2 * x + 3] * 1 + t[2 * x + 1] * 9 + - t[2 * x + 3] * 3 + 8) >> - 4; - e[4 * x + 2] = (s[2 * x + 0] * 1 + s[2 * x + 2] * 3 + t[2 * x + 0] * 3 + - t[2 * x + 2] * 9 + 8) >> - 4; - e[4 * x + 3] = (s[2 * x + 1] * 1 + s[2 * x + 3] * 3 + t[2 * x + 1] * 3 + - t[2 * x + 3] * 9 + 8) >> - 4; - } -} - -// Scales a single row of pixels using point sampling. -void ScaleUVCols_C(uint8_t* dst_uv, - const uint8_t* src_uv, - int dst_width, - int x, - int dx) { - const uint16_t* src = (const uint16_t*)(src_uv); - uint16_t* dst = (uint16_t*)(dst_uv); - int j; - for (j = 0; j < dst_width - 1; j += 2) { - dst[0] = src[x >> 16]; - x += dx; - dst[1] = src[x >> 16]; - x += dx; - dst += 2; - } - if (dst_width & 1) { - dst[0] = src[x >> 16]; - } -} - -void ScaleUVCols64_C(uint8_t* dst_uv, - const uint8_t* src_uv, - int dst_width, - int x32, - int dx) { - int64_t x = (int64_t)(x32); - const uint16_t* src = (const uint16_t*)(src_uv); - uint16_t* dst = (uint16_t*)(dst_uv); - int j; - for (j = 0; j < dst_width - 1; j += 2) { - dst[0] = src[x >> 16]; - x += dx; - dst[1] = src[x >> 16]; - x += dx; - dst += 2; - } - if (dst_width & 1) { - dst[0] = src[x >> 16]; - } -} - -// Scales a single row of pixels up by 2x using point sampling. -void ScaleUVColsUp2_C(uint8_t* dst_uv, - const uint8_t* src_uv, - int dst_width, - int x, - int dx) { - const uint16_t* src = (const uint16_t*)(src_uv); - uint16_t* dst = (uint16_t*)(dst_uv); - int j; - (void)x; - (void)dx; - for (j = 0; j < dst_width - 1; j += 2) { - dst[1] = dst[0] = src[0]; - src += 1; - dst += 2; - } - if (dst_width & 1) { - dst[0] = src[0]; - } -} - -// TODO(fbarchard): Replace 0x7f ^ f with 128-f. bug=607. 
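// Illustrative sketch, not from the libyuv sources: the BLENDER macros below
// mimic the SSSE3 blend using a 7-bit fraction taken from a 16.16 fixed-point
// position. The integer part (x >> 16) picks the left sample, bits 9..15 give
// f in 0..127, and 0x7f ^ f equals 127 - f (the TODO above asks for 128 - f).
static uint8_t ScaleBlendSketch(uint8_t a, uint8_t b, int x /* 16.16 */) {
  int f = (x >> 9) & 0x7f;                         // 7-bit fraction of position
  return (uint8_t)((a * (127 - f) + b * f) >> 7);  // blend a toward b
}
// e.g. f == 64 (half way between the two samples) yields roughly (a + b) / 2.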
-// Mimics SSSE3 blender -#define BLENDER1(a, b, f) ((a) * (0x7f ^ f) + (b)*f) >> 7 -#define BLENDERC(a, b, f, s) \ - (uint16_t)(BLENDER1(((a) >> s) & 255, ((b) >> s) & 255, f) << s) -#define BLENDER(a, b, f) BLENDERC(a, b, f, 8) | BLENDERC(a, b, f, 0) - -void ScaleUVFilterCols_C(uint8_t* dst_uv, - const uint8_t* src_uv, - int dst_width, - int x, - int dx) { - const uint16_t* src = (const uint16_t*)(src_uv); - uint16_t* dst = (uint16_t*)(dst_uv); - int j; - for (j = 0; j < dst_width - 1; j += 2) { - int xi = x >> 16; - int xf = (x >> 9) & 0x7f; - uint16_t a = src[xi]; - uint16_t b = src[xi + 1]; - dst[0] = BLENDER(a, b, xf); - x += dx; - xi = x >> 16; - xf = (x >> 9) & 0x7f; - a = src[xi]; - b = src[xi + 1]; - dst[1] = BLENDER(a, b, xf); - x += dx; - dst += 2; - } - if (dst_width & 1) { - int xi = x >> 16; - int xf = (x >> 9) & 0x7f; - uint16_t a = src[xi]; - uint16_t b = src[xi + 1]; - dst[0] = BLENDER(a, b, xf); - } -} - -void ScaleUVFilterCols64_C(uint8_t* dst_uv, - const uint8_t* src_uv, - int dst_width, - int x32, - int dx) { - int64_t x = (int64_t)(x32); - const uint16_t* src = (const uint16_t*)(src_uv); - uint16_t* dst = (uint16_t*)(dst_uv); - int j; - for (j = 0; j < dst_width - 1; j += 2) { - int64_t xi = x >> 16; - int xf = (x >> 9) & 0x7f; - uint16_t a = src[xi]; - uint16_t b = src[xi + 1]; - dst[0] = BLENDER(a, b, xf); - x += dx; - xi = x >> 16; - xf = (x >> 9) & 0x7f; - a = src[xi]; - b = src[xi + 1]; - dst[1] = BLENDER(a, b, xf); - x += dx; - dst += 2; - } - if (dst_width & 1) { - int64_t xi = x >> 16; - int xf = (x >> 9) & 0x7f; - uint16_t a = src[xi]; - uint16_t b = src[xi + 1]; - dst[0] = BLENDER(a, b, xf); - } -} -#undef BLENDER1 -#undef BLENDERC -#undef BLENDER - -// Scale plane vertically with bilinear interpolation. -void ScalePlaneVertical(int src_height, - int dst_width, - int dst_height, - int src_stride, - int dst_stride, - const uint8_t* src_argb, - uint8_t* dst_argb, - int x, - int y, - int dy, - int bpp, - enum FilterMode filtering) { - // TODO(fbarchard): Allow higher bpp. - int dst_width_bytes = dst_width * bpp; - void (*InterpolateRow)(uint8_t * dst_argb, const uint8_t* src_argb, - ptrdiff_t src_stride, int dst_width, - int source_y_fraction) = InterpolateRow_C; - const int max_y = (src_height > 1) ? 
((src_height - 1) << 16) - 1 : 0; - int j; - assert(bpp >= 1 && bpp <= 4); - assert(src_height != 0); - assert(dst_width > 0); - assert(dst_height > 0); - src_argb += (x >> 16) * bpp; -#if defined(HAS_INTERPOLATEROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - InterpolateRow = InterpolateRow_Any_SSSE3; - if (IS_ALIGNED(dst_width_bytes, 16)) { - InterpolateRow = InterpolateRow_SSSE3; - } - } -#endif -#if defined(HAS_INTERPOLATEROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - InterpolateRow = InterpolateRow_Any_AVX2; - if (IS_ALIGNED(dst_width_bytes, 32)) { - InterpolateRow = InterpolateRow_AVX2; - } - } -#endif -#if defined(HAS_INTERPOLATEROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - InterpolateRow = InterpolateRow_Any_NEON; - if (IS_ALIGNED(dst_width_bytes, 16)) { - InterpolateRow = InterpolateRow_NEON; - } - } -#endif -#if defined(HAS_INTERPOLATEROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - InterpolateRow = InterpolateRow_Any_MMI; - if (IS_ALIGNED(dst_width_bytes, 8)) { - InterpolateRow = InterpolateRow_MMI; - } - } -#endif -#if defined(HAS_INTERPOLATEROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - InterpolateRow = InterpolateRow_Any_MSA; - if (IS_ALIGNED(dst_width_bytes, 32)) { - InterpolateRow = InterpolateRow_MSA; - } - } -#endif - for (j = 0; j < dst_height; ++j) { - int yi; - int yf; - if (y > max_y) { - y = max_y; - } - yi = y >> 16; - yf = filtering ? ((y >> 8) & 255) : 0; - InterpolateRow(dst_argb, src_argb + yi * src_stride, src_stride, - dst_width_bytes, yf); - dst_argb += dst_stride; - y += dy; - } -} -void ScalePlaneVertical_16(int src_height, - int dst_width, - int dst_height, - int src_stride, - int dst_stride, - const uint16_t* src_argb, - uint16_t* dst_argb, - int x, - int y, - int dy, - int wpp, - enum FilterMode filtering) { - // TODO(fbarchard): Allow higher wpp. - int dst_width_words = dst_width * wpp; - void (*InterpolateRow)(uint16_t * dst_argb, const uint16_t* src_argb, - ptrdiff_t src_stride, int dst_width, - int source_y_fraction) = InterpolateRow_16_C; - const int max_y = (src_height > 1) ? ((src_height - 1) << 16) - 1 : 0; - int j; - assert(wpp >= 1 && wpp <= 2); - assert(src_height != 0); - assert(dst_width > 0); - assert(dst_height > 0); - src_argb += (x >> 16) * wpp; -#if defined(HAS_INTERPOLATEROW_16_SSE2) - if (TestCpuFlag(kCpuHasSSE2)) { - InterpolateRow = InterpolateRow_Any_16_SSE2; - if (IS_ALIGNED(dst_width_bytes, 16)) { - InterpolateRow = InterpolateRow_16_SSE2; - } - } -#endif -#if defined(HAS_INTERPOLATEROW_16_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - InterpolateRow = InterpolateRow_Any_16_SSSE3; - if (IS_ALIGNED(dst_width_bytes, 16)) { - InterpolateRow = InterpolateRow_16_SSSE3; - } - } -#endif -#if defined(HAS_INTERPOLATEROW_16_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - InterpolateRow = InterpolateRow_Any_16_AVX2; - if (IS_ALIGNED(dst_width_bytes, 32)) { - InterpolateRow = InterpolateRow_16_AVX2; - } - } -#endif -#if defined(HAS_INTERPOLATEROW_16_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - InterpolateRow = InterpolateRow_Any_16_NEON; - if (IS_ALIGNED(dst_width_bytes, 16)) { - InterpolateRow = InterpolateRow_16_NEON; - } - } -#endif - for (j = 0; j < dst_height; ++j) { - int yi; - int yf; - if (y > max_y) { - y = max_y; - } - yi = y >> 16; - yf = filtering ? ((y >> 8) & 255) : 0; - InterpolateRow(dst_argb, src_argb + yi * src_stride, src_stride, - dst_width_words, yf); - dst_argb += dst_stride; - y += dy; - } -} - -// Simplify the filtering based on scale factors. 
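// Illustrative sketch, not from the libyuv sources: the ScalePlaneVertical
// loops above walk the source rows in 16.16 fixed point. The integer part
// selects the source row, the next 8 bits become the blend fraction handed to
// InterpolateRow, and y is clamped to max_y so the row after the last one is
// never addressed. A scalar equivalent for one destination row with filtering
// enabled (rounding omitted):
static void ScaleVerticalRowSketch(const uint8_t* src, int src_stride,
                                   uint8_t* dst, int width,
                                   int y /* 16.16 */, int max_y) {
  int yi, yf, i;
  if (y > max_y) {
    y = max_y;  // clamp so yi + 1 stays inside the image
  }
  yi = y >> 16;         // source row index
  yf = (y >> 8) & 255;  // 8-bit fraction between rows yi and yi + 1
  for (i = 0; i < width; ++i) {
    int a = src[yi * src_stride + i];
    int b = yf ? src[(yi + 1) * src_stride + i] : a;  // yf == 0: plain copy
    dst[i] = (uint8_t)((a * (256 - yf) + b * yf) >> 8);
  }
}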
-enum FilterMode ScaleFilterReduce(int src_width, - int src_height, - int dst_width, - int dst_height, - enum FilterMode filtering) { - if (src_width < 0) { - src_width = -src_width; - } - if (src_height < 0) { - src_height = -src_height; - } - if (filtering == kFilterBox) { - // If scaling either axis to 0.5 or larger, switch from Box to Bilinear. - if (dst_width * 2 >= src_width || dst_height * 2 >= src_height) { - filtering = kFilterBilinear; - } - } - if (filtering == kFilterBilinear) { - if (src_height == 1) { - filtering = kFilterLinear; - } - // TODO(fbarchard): Detect any odd scale factor and reduce to Linear. - if (dst_height == src_height || dst_height * 3 == src_height) { - filtering = kFilterLinear; - } - // TODO(fbarchard): Remove 1 pixel wide filter restriction, which is to - // avoid reading 2 pixels horizontally that causes memory exception. - if (src_width == 1) { - filtering = kFilterNone; - } - } - if (filtering == kFilterLinear) { - if (src_width == 1) { - filtering = kFilterNone; - } - // TODO(fbarchard): Detect any odd scale factor and reduce to None. - if (dst_width == src_width || dst_width * 3 == src_width) { - filtering = kFilterNone; - } - } - return filtering; -} - -// Divide num by div and return as 16.16 fixed point result. -int FixedDiv_C(int num, int div) { - return (int)(((int64_t)(num) << 16) / div); -} - -// Divide num by div and return as 16.16 fixed point result. -int FixedDiv1_C(int num, int div) { - return (int)((((int64_t)(num) << 16) - 0x00010001) / (div - 1)); -} - -#define CENTERSTART(dx, s) (dx < 0) ? -((-dx >> 1) + s) : ((dx >> 1) + s) - -// Compute slope values for stepping. -void ScaleSlope(int src_width, - int src_height, - int dst_width, - int dst_height, - enum FilterMode filtering, - int* x, - int* y, - int* dx, - int* dy) { - assert(x != NULL); - assert(y != NULL); - assert(dx != NULL); - assert(dy != NULL); - assert(src_width != 0); - assert(src_height != 0); - assert(dst_width > 0); - assert(dst_height > 0); - // Check for 1 pixel and avoid FixedDiv overflow. - if (dst_width == 1 && src_width >= 32768) { - dst_width = src_width; - } - if (dst_height == 1 && src_height >= 32768) { - dst_height = src_height; - } - if (filtering == kFilterBox) { - // Scale step for point sampling duplicates all pixels equally. - *dx = FixedDiv(Abs(src_width), dst_width); - *dy = FixedDiv(src_height, dst_height); - *x = 0; - *y = 0; - } else if (filtering == kFilterBilinear) { - // Scale step for bilinear sampling renders last pixel once for upsample. - if (dst_width <= Abs(src_width)) { - *dx = FixedDiv(Abs(src_width), dst_width); - *x = CENTERSTART(*dx, -32768); // Subtract 0.5 (32768) to center filter. - } else if (dst_width > 1) { - *dx = FixedDiv1(Abs(src_width), dst_width); - *x = 0; - } - if (dst_height <= src_height) { - *dy = FixedDiv(src_height, dst_height); - *y = CENTERSTART(*dy, -32768); // Subtract 0.5 (32768) to center filter. - } else if (dst_height > 1) { - *dy = FixedDiv1(src_height, dst_height); - *y = 0; - } - } else if (filtering == kFilterLinear) { - // Scale step for bilinear sampling renders last pixel once for upsample. - if (dst_width <= Abs(src_width)) { - *dx = FixedDiv(Abs(src_width), dst_width); - *x = CENTERSTART(*dx, -32768); // Subtract 0.5 (32768) to center filter. - } else if (dst_width > 1) { - *dx = FixedDiv1(Abs(src_width), dst_width); - *x = 0; - } - *dy = FixedDiv(src_height, dst_height); - *y = *dy >> 1; - } else { - // Scale step for point sampling duplicates all pixels equally. 
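// Worked example (illustrative only, not from the libyuv sources): FixedDiv
// returns a 16.16 step, e.g. FixedDiv(640, 480) == (640 << 16) / 480 ==
// 0x00015555, about 1.3333 source pixels per destination pixel. For the point
// sampling case here, CENTERSTART(dx, 0) == dx >> 1 starts half a step into
// the source, so each destination pixel samples the centre of the span of
// source pixels it represents.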
- *dx = FixedDiv(Abs(src_width), dst_width); - *dy = FixedDiv(src_height, dst_height); - *x = CENTERSTART(*dx, 0); - *y = CENTERSTART(*dy, 0); - } - // Negative src_width means horizontally mirror. - if (src_width < 0) { - *x += (dst_width - 1) * *dx; - *dx = -*dx; - // src_width = -src_width; // Caller must do this. - } -} -#undef CENTERSTART - -// Read 8x2 upsample with filtering and write 16x1. -// actually reads an extra pixel, so 9x2. -void ScaleRowUp2_16_C(const uint16_t* src_ptr, - ptrdiff_t src_stride, - uint16_t* dst, - int dst_width) { - const uint16_t* src2 = src_ptr + src_stride; - - int x; - for (x = 0; x < dst_width - 1; x += 2) { - uint16_t p0 = src_ptr[0]; - uint16_t p1 = src_ptr[1]; - uint16_t p2 = src2[0]; - uint16_t p3 = src2[1]; - dst[0] = (p0 * 9 + p1 * 3 + p2 * 3 + p3 + 8) >> 4; - dst[1] = (p0 * 3 + p1 * 9 + p2 + p3 * 3 + 8) >> 4; - ++src_ptr; - ++src2; - dst += 2; - } - if (dst_width & 1) { - uint16_t p0 = src_ptr[0]; - uint16_t p1 = src_ptr[1]; - uint16_t p2 = src2[0]; - uint16_t p3 = src2[1]; - dst[0] = (p0 * 9 + p1 * 3 + p2 * 3 + p3 + 8) >> 4; - } -} - -#ifdef __cplusplus -} // extern "C" -} // namespace libyuv -#endif diff --git a/thirdparty/libyuv/source/scale_gcc.cc b/thirdparty/libyuv/source/scale_gcc.cc deleted file mode 100644 index ebc6deb..0000000 --- a/thirdparty/libyuv/source/scale_gcc.cc +++ /dev/null @@ -1,2948 +0,0 @@ -/* - * Copyright 2013 The LibYuv Project Authors. All rights reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#include "libyuv/row.h" -#include "libyuv/scale_row.h" - -#ifdef __cplusplus -namespace libyuv { -extern "C" { -#endif - -// This module is for GCC x86 and x64. -#if !defined(LIBYUV_DISABLE_X86) && \ - (defined(__x86_64__) || defined(__i386__)) - -// Offsets for source bytes 0 to 9 -static const uvec8 kShuf0 = {0, 1, 3, 4, 5, 7, 8, 9, - 128, 128, 128, 128, 128, 128, 128, 128}; - -// Offsets for source bytes 11 to 20 with 8 subtracted = 3 to 12. -static const uvec8 kShuf1 = {3, 4, 5, 7, 8, 9, 11, 12, - 128, 128, 128, 128, 128, 128, 128, 128}; - -// Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 31. -static const uvec8 kShuf2 = {5, 7, 8, 9, 11, 12, 13, 15, - 128, 128, 128, 128, 128, 128, 128, 128}; - -// Offsets for source bytes 0 to 10 -static const uvec8 kShuf01 = {0, 1, 1, 2, 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10}; - -// Offsets for source bytes 10 to 21 with 8 subtracted = 3 to 13. -static const uvec8 kShuf11 = {2, 3, 4, 5, 5, 6, 6, 7, - 8, 9, 9, 10, 10, 11, 12, 13}; - -// Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 31. 
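// Worked example (illustrative only, not from the libyuv sources): the
// kScaleAc33 and kScaleAb2 tables below, like the C box filters in
// scale_common.cc, average N samples without a divide: multiplying the sum by
// 65536 / N and shifting right by 16 is a 16.16 multiply by 1/N. For a 3x3
// block of nine pixels all equal to 200, (1800 * 7281) >> 16 == 199, within
// one unit of the exact mean.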
-static const uvec8 kShuf21 = {5, 6, 6, 7, 8, 9, 9, 10, - 10, 11, 12, 13, 13, 14, 14, 15}; - -// Coefficients for source bytes 0 to 10 -static const uvec8 kMadd01 = {3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2}; - -// Coefficients for source bytes 10 to 21 -static const uvec8 kMadd11 = {1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1}; - -// Coefficients for source bytes 21 to 31 -static const uvec8 kMadd21 = {2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3}; - -// Coefficients for source bytes 21 to 31 -static const vec16 kRound34 = {2, 2, 2, 2, 2, 2, 2, 2}; - -static const uvec8 kShuf38a = {0, 3, 6, 8, 11, 14, 128, 128, - 128, 128, 128, 128, 128, 128, 128, 128}; - -static const uvec8 kShuf38b = {128, 128, 128, 128, 128, 128, 0, 3, - 6, 8, 11, 14, 128, 128, 128, 128}; - -// Arrange words 0,3,6 into 0,1,2 -static const uvec8 kShufAc = {0, 1, 6, 7, 12, 13, 128, 128, - 128, 128, 128, 128, 128, 128, 128, 128}; - -// Arrange words 0,3,6 into 3,4,5 -static const uvec8 kShufAc3 = {128, 128, 128, 128, 128, 128, 0, 1, - 6, 7, 12, 13, 128, 128, 128, 128}; - -// Scaling values for boxes of 3x3 and 2x3 -static const uvec16 kScaleAc33 = {65536 / 9, 65536 / 9, 65536 / 6, 65536 / 9, - 65536 / 9, 65536 / 6, 0, 0}; - -// Arrange first value for pixels 0,1,2,3,4,5 -static const uvec8 kShufAb0 = {0, 128, 3, 128, 6, 128, 8, 128, - 11, 128, 14, 128, 128, 128, 128, 128}; - -// Arrange second value for pixels 0,1,2,3,4,5 -static const uvec8 kShufAb1 = {1, 128, 4, 128, 7, 128, 9, 128, - 12, 128, 15, 128, 128, 128, 128, 128}; - -// Arrange third value for pixels 0,1,2,3,4,5 -static const uvec8 kShufAb2 = {2, 128, 5, 128, 128, 128, 10, 128, - 13, 128, 128, 128, 128, 128, 128, 128}; - -// Scaling values for boxes of 3x2 and 2x2 -static const uvec16 kScaleAb2 = {65536 / 3, 65536 / 3, 65536 / 2, 65536 / 3, - 65536 / 3, 65536 / 2, 0, 0}; - -// GCC versions of row functions are verbatim conversions from Visual C. -// Generated using gcc disassembly on Visual C object file: -// objdump -D yuvscaler.obj >yuvscaler.txt - -void ScaleRowDown2_SSSE3(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width) { - (void)src_stride; - asm volatile( - // 16 pixel loop. 
- LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "lea 0x20(%0),%0 \n" - "psrlw $0x8,%%xmm0 \n" - "psrlw $0x8,%%xmm1 \n" - "packuswb %%xmm1,%%xmm0 \n" - "movdqu %%xmm0,(%1) \n" - "lea 0x10(%1),%1 \n" - "sub $0x10,%2 \n" - "jg 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - ::"memory", - "cc", "xmm0", "xmm1"); -} - -void ScaleRowDown2Linear_SSSE3(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width) { - (void)src_stride; - asm volatile( - "pcmpeqb %%xmm4,%%xmm4 \n" - "psrlw $0xf,%%xmm4 \n" - "packuswb %%xmm4,%%xmm4 \n" - "pxor %%xmm5,%%xmm5 \n" - - LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "lea 0x20(%0),%0 \n" - "pmaddubsw %%xmm4,%%xmm0 \n" - "pmaddubsw %%xmm4,%%xmm1 \n" - "pavgw %%xmm5,%%xmm0 \n" - "pavgw %%xmm5,%%xmm1 \n" - "packuswb %%xmm1,%%xmm0 \n" - "movdqu %%xmm0,(%1) \n" - "lea 0x10(%1),%1 \n" - "sub $0x10,%2 \n" - "jg 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - ::"memory", - "cc", "xmm0", "xmm1", "xmm4", "xmm5"); -} - -void ScaleRowDown2Box_SSSE3(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width) { - asm volatile( - "pcmpeqb %%xmm4,%%xmm4 \n" - "psrlw $0xf,%%xmm4 \n" - "packuswb %%xmm4,%%xmm4 \n" - "pxor %%xmm5,%%xmm5 \n" - - LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "movdqu 0x00(%0,%3,1),%%xmm2 \n" - "movdqu 0x10(%0,%3,1),%%xmm3 \n" - "lea 0x20(%0),%0 \n" - "pmaddubsw %%xmm4,%%xmm0 \n" - "pmaddubsw %%xmm4,%%xmm1 \n" - "pmaddubsw %%xmm4,%%xmm2 \n" - "pmaddubsw %%xmm4,%%xmm3 \n" - "paddw %%xmm2,%%xmm0 \n" - "paddw %%xmm3,%%xmm1 \n" - "psrlw $0x1,%%xmm0 \n" - "psrlw $0x1,%%xmm1 \n" - "pavgw %%xmm5,%%xmm0 \n" - "pavgw %%xmm5,%%xmm1 \n" - "packuswb %%xmm1,%%xmm0 \n" - "movdqu %%xmm0,(%1) \n" - "lea 0x10(%1),%1 \n" - "sub $0x10,%2 \n" - "jg 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : "r"((intptr_t)(src_stride)) // %3 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"); -} - -#ifdef HAS_SCALEROWDOWN2_AVX2 -void ScaleRowDown2_AVX2(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width) { - (void)src_stride; - asm volatile(LABELALIGN - "1: \n" - "vmovdqu (%0),%%ymm0 \n" - "vmovdqu 0x20(%0),%%ymm1 \n" - "lea 0x40(%0),%0 \n" - "vpsrlw $0x8,%%ymm0,%%ymm0 \n" - "vpsrlw $0x8,%%ymm1,%%ymm1 \n" - "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" - "vpermq $0xd8,%%ymm0,%%ymm0 \n" - "vmovdqu %%ymm0,(%1) \n" - "lea 0x20(%1),%1 \n" - "sub $0x20,%2 \n" - "jg 1b \n" - "vzeroupper \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - ::"memory", - "cc", "xmm0", "xmm1"); -} - -void ScaleRowDown2Linear_AVX2(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width) { - (void)src_stride; - asm volatile( - "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n" - "vpsrlw $0xf,%%ymm4,%%ymm4 \n" - "vpackuswb %%ymm4,%%ymm4,%%ymm4 \n" - "vpxor %%ymm5,%%ymm5,%%ymm5 \n" - - LABELALIGN - "1: \n" - "vmovdqu (%0),%%ymm0 \n" - "vmovdqu 0x20(%0),%%ymm1 \n" - "lea 0x40(%0),%0 \n" - "vpmaddubsw %%ymm4,%%ymm0,%%ymm0 \n" - "vpmaddubsw %%ymm4,%%ymm1,%%ymm1 \n" - "vpavgw %%ymm5,%%ymm0,%%ymm0 \n" - "vpavgw %%ymm5,%%ymm1,%%ymm1 \n" - "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" - "vpermq $0xd8,%%ymm0,%%ymm0 \n" - "vmovdqu %%ymm0,(%1) \n" - "lea 0x20(%1),%1 \n" - "sub $0x20,%2 \n" - "jg 1b \n" - "vzeroupper \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - ::"memory", - "cc", "xmm0", 
"xmm1", "xmm4", "xmm5"); -} - -void ScaleRowDown2Box_AVX2(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width) { - asm volatile( - "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n" - "vpsrlw $0xf,%%ymm4,%%ymm4 \n" - "vpackuswb %%ymm4,%%ymm4,%%ymm4 \n" - "vpxor %%ymm5,%%ymm5,%%ymm5 \n" - - LABELALIGN - "1: \n" - "vmovdqu (%0),%%ymm0 \n" - "vmovdqu 0x20(%0),%%ymm1 \n" - "vmovdqu 0x00(%0,%3,1),%%ymm2 \n" - "vmovdqu 0x20(%0,%3,1),%%ymm3 \n" - "lea 0x40(%0),%0 \n" - "vpmaddubsw %%ymm4,%%ymm0,%%ymm0 \n" - "vpmaddubsw %%ymm4,%%ymm1,%%ymm1 \n" - "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n" - "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n" - "vpaddw %%ymm2,%%ymm0,%%ymm0 \n" - "vpaddw %%ymm3,%%ymm1,%%ymm1 \n" - "vpsrlw $0x1,%%ymm0,%%ymm0 \n" - "vpsrlw $0x1,%%ymm1,%%ymm1 \n" - "vpavgw %%ymm5,%%ymm0,%%ymm0 \n" - "vpavgw %%ymm5,%%ymm1,%%ymm1 \n" - "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" - "vpermq $0xd8,%%ymm0,%%ymm0 \n" - "vmovdqu %%ymm0,(%1) \n" - "lea 0x20(%1),%1 \n" - "sub $0x20,%2 \n" - "jg 1b \n" - "vzeroupper \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : "r"((intptr_t)(src_stride)) // %3 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"); -} -#endif // HAS_SCALEROWDOWN2_AVX2 - -void ScaleRowDown4_SSSE3(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width) { - (void)src_stride; - asm volatile( - "pcmpeqb %%xmm5,%%xmm5 \n" - "psrld $0x18,%%xmm5 \n" - "pslld $0x10,%%xmm5 \n" - - LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "lea 0x20(%0),%0 \n" - "pand %%xmm5,%%xmm0 \n" - "pand %%xmm5,%%xmm1 \n" - "packuswb %%xmm1,%%xmm0 \n" - "psrlw $0x8,%%xmm0 \n" - "packuswb %%xmm0,%%xmm0 \n" - "movq %%xmm0,(%1) \n" - "lea 0x8(%1),%1 \n" - "sub $0x8,%2 \n" - "jg 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - ::"memory", - "cc", "xmm0", "xmm1", "xmm5"); -} - -void ScaleRowDown4Box_SSSE3(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width) { - intptr_t stridex3; - asm volatile( - "pcmpeqb %%xmm4,%%xmm4 \n" - "psrlw $0xf,%%xmm4 \n" - "movdqa %%xmm4,%%xmm5 \n" - "packuswb %%xmm4,%%xmm4 \n" - "psllw $0x3,%%xmm5 \n" - "lea 0x00(%4,%4,2),%3 \n" - - LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "movdqu 0x00(%0,%4,1),%%xmm2 \n" - "movdqu 0x10(%0,%4,1),%%xmm3 \n" - "pmaddubsw %%xmm4,%%xmm0 \n" - "pmaddubsw %%xmm4,%%xmm1 \n" - "pmaddubsw %%xmm4,%%xmm2 \n" - "pmaddubsw %%xmm4,%%xmm3 \n" - "paddw %%xmm2,%%xmm0 \n" - "paddw %%xmm3,%%xmm1 \n" - "movdqu 0x00(%0,%4,2),%%xmm2 \n" - "movdqu 0x10(%0,%4,2),%%xmm3 \n" - "pmaddubsw %%xmm4,%%xmm2 \n" - "pmaddubsw %%xmm4,%%xmm3 \n" - "paddw %%xmm2,%%xmm0 \n" - "paddw %%xmm3,%%xmm1 \n" - "movdqu 0x00(%0,%3,1),%%xmm2 \n" - "movdqu 0x10(%0,%3,1),%%xmm3 \n" - "lea 0x20(%0),%0 \n" - "pmaddubsw %%xmm4,%%xmm2 \n" - "pmaddubsw %%xmm4,%%xmm3 \n" - "paddw %%xmm2,%%xmm0 \n" - "paddw %%xmm3,%%xmm1 \n" - "phaddw %%xmm1,%%xmm0 \n" - "paddw %%xmm5,%%xmm0 \n" - "psrlw $0x4,%%xmm0 \n" - "packuswb %%xmm0,%%xmm0 \n" - "movq %%xmm0,(%1) \n" - "lea 0x8(%1),%1 \n" - "sub $0x8,%2 \n" - "jg 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width), // %2 - "=&r"(stridex3) // %3 - : "r"((intptr_t)(src_stride)) // %4 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); -} - -#ifdef HAS_SCALEROWDOWN4_AVX2 -void ScaleRowDown4_AVX2(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width) { - (void)src_stride; - asm volatile( - "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" 
- "vpsrld $0x18,%%ymm5,%%ymm5 \n" - "vpslld $0x10,%%ymm5,%%ymm5 \n" - - LABELALIGN - "1: \n" - "vmovdqu (%0),%%ymm0 \n" - "vmovdqu 0x20(%0),%%ymm1 \n" - "lea 0x40(%0),%0 \n" - "vpand %%ymm5,%%ymm0,%%ymm0 \n" - "vpand %%ymm5,%%ymm1,%%ymm1 \n" - "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" - "vpermq $0xd8,%%ymm0,%%ymm0 \n" - "vpsrlw $0x8,%%ymm0,%%ymm0 \n" - "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n" - "vpermq $0xd8,%%ymm0,%%ymm0 \n" - "vmovdqu %%xmm0,(%1) \n" - "lea 0x10(%1),%1 \n" - "sub $0x10,%2 \n" - "jg 1b \n" - "vzeroupper \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - ::"memory", - "cc", "xmm0", "xmm1", "xmm5"); -} - -void ScaleRowDown4Box_AVX2(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width) { - asm volatile( - "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n" - "vpsrlw $0xf,%%ymm4,%%ymm4 \n" - "vpsllw $0x3,%%ymm4,%%ymm5 \n" - "vpackuswb %%ymm4,%%ymm4,%%ymm4 \n" - - LABELALIGN - "1: \n" - "vmovdqu (%0),%%ymm0 \n" - "vmovdqu 0x20(%0),%%ymm1 \n" - "vmovdqu 0x00(%0,%3,1),%%ymm2 \n" - "vmovdqu 0x20(%0,%3,1),%%ymm3 \n" - "vpmaddubsw %%ymm4,%%ymm0,%%ymm0 \n" - "vpmaddubsw %%ymm4,%%ymm1,%%ymm1 \n" - "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n" - "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n" - "vpaddw %%ymm2,%%ymm0,%%ymm0 \n" - "vpaddw %%ymm3,%%ymm1,%%ymm1 \n" - "vmovdqu 0x00(%0,%3,2),%%ymm2 \n" - "vmovdqu 0x20(%0,%3,2),%%ymm3 \n" - "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n" - "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n" - "vpaddw %%ymm2,%%ymm0,%%ymm0 \n" - "vpaddw %%ymm3,%%ymm1,%%ymm1 \n" - "vmovdqu 0x00(%0,%4,1),%%ymm2 \n" - "vmovdqu 0x20(%0,%4,1),%%ymm3 \n" - "lea 0x40(%0),%0 \n" - "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n" - "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n" - "vpaddw %%ymm2,%%ymm0,%%ymm0 \n" - "vpaddw %%ymm3,%%ymm1,%%ymm1 \n" - "vphaddw %%ymm1,%%ymm0,%%ymm0 \n" - "vpermq $0xd8,%%ymm0,%%ymm0 \n" - "vpaddw %%ymm5,%%ymm0,%%ymm0 \n" - "vpsrlw $0x4,%%ymm0,%%ymm0 \n" - "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n" - "vpermq $0xd8,%%ymm0,%%ymm0 \n" - "vmovdqu %%xmm0,(%1) \n" - "lea 0x10(%1),%1 \n" - "sub $0x10,%2 \n" - "jg 1b \n" - "vzeroupper \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : "r"((intptr_t)(src_stride)), // %3 - "r"((intptr_t)(src_stride * 3)) // %4 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); -} -#endif // HAS_SCALEROWDOWN4_AVX2 - -void ScaleRowDown34_SSSE3(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width) { - (void)src_stride; - asm volatile( - "movdqa %0,%%xmm3 \n" - "movdqa %1,%%xmm4 \n" - "movdqa %2,%%xmm5 \n" - : - : "m"(kShuf0), // %0 - "m"(kShuf1), // %1 - "m"(kShuf2) // %2 - ); - asm volatile(LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm2 \n" - "lea 0x20(%0),%0 \n" - "movdqa %%xmm2,%%xmm1 \n" - "palignr $0x8,%%xmm0,%%xmm1 \n" - "pshufb %%xmm3,%%xmm0 \n" - "pshufb %%xmm4,%%xmm1 \n" - "pshufb %%xmm5,%%xmm2 \n" - "movq %%xmm0,(%1) \n" - "movq %%xmm1,0x8(%1) \n" - "movq %%xmm2,0x10(%1) \n" - "lea 0x18(%1),%1 \n" - "sub $0x18,%2 \n" - "jg 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - ::"memory", - "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); -} - -void ScaleRowDown34_1_Box_SSSE3(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width) { - asm volatile( - "movdqa %0,%%xmm2 \n" // kShuf01 - "movdqa %1,%%xmm3 \n" // kShuf11 - "movdqa %2,%%xmm4 \n" // kShuf21 - : - : "m"(kShuf01), // %0 - "m"(kShuf11), // %1 - "m"(kShuf21) // %2 - ); - asm volatile( - "movdqa %0,%%xmm5 \n" // kMadd01 - "movdqa 
%1,%%xmm0 \n" // kMadd11 - "movdqa %2,%%xmm1 \n" // kRound34 - : - : "m"(kMadd01), // %0 - "m"(kMadd11), // %1 - "m"(kRound34) // %2 - ); - asm volatile(LABELALIGN - "1: \n" - "movdqu (%0),%%xmm6 \n" - "movdqu 0x00(%0,%3,1),%%xmm7 \n" - "pavgb %%xmm7,%%xmm6 \n" - "pshufb %%xmm2,%%xmm6 \n" - "pmaddubsw %%xmm5,%%xmm6 \n" - "paddsw %%xmm1,%%xmm6 \n" - "psrlw $0x2,%%xmm6 \n" - "packuswb %%xmm6,%%xmm6 \n" - "movq %%xmm6,(%1) \n" - "movdqu 0x8(%0),%%xmm6 \n" - "movdqu 0x8(%0,%3,1),%%xmm7 \n" - "pavgb %%xmm7,%%xmm6 \n" - "pshufb %%xmm3,%%xmm6 \n" - "pmaddubsw %%xmm0,%%xmm6 \n" - "paddsw %%xmm1,%%xmm6 \n" - "psrlw $0x2,%%xmm6 \n" - "packuswb %%xmm6,%%xmm6 \n" - "movq %%xmm6,0x8(%1) \n" - "movdqu 0x10(%0),%%xmm6 \n" - "movdqu 0x10(%0,%3,1),%%xmm7 \n" - "lea 0x20(%0),%0 \n" - "pavgb %%xmm7,%%xmm6 \n" - "pshufb %%xmm4,%%xmm6 \n" - "pmaddubsw %4,%%xmm6 \n" - "paddsw %%xmm1,%%xmm6 \n" - "psrlw $0x2,%%xmm6 \n" - "packuswb %%xmm6,%%xmm6 \n" - "movq %%xmm6,0x10(%1) \n" - "lea 0x18(%1),%1 \n" - "sub $0x18,%2 \n" - "jg 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : "r"((intptr_t)(src_stride)), // %3 - "m"(kMadd21) // %4 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", - "xmm6", "xmm7"); -} - -void ScaleRowDown34_0_Box_SSSE3(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width) { - asm volatile( - "movdqa %0,%%xmm2 \n" // kShuf01 - "movdqa %1,%%xmm3 \n" // kShuf11 - "movdqa %2,%%xmm4 \n" // kShuf21 - : - : "m"(kShuf01), // %0 - "m"(kShuf11), // %1 - "m"(kShuf21) // %2 - ); - asm volatile( - "movdqa %0,%%xmm5 \n" // kMadd01 - "movdqa %1,%%xmm0 \n" // kMadd11 - "movdqa %2,%%xmm1 \n" // kRound34 - : - : "m"(kMadd01), // %0 - "m"(kMadd11), // %1 - "m"(kRound34) // %2 - ); - - asm volatile(LABELALIGN - "1: \n" - "movdqu (%0),%%xmm6 \n" - "movdqu 0x00(%0,%3,1),%%xmm7 \n" - "pavgb %%xmm6,%%xmm7 \n" - "pavgb %%xmm7,%%xmm6 \n" - "pshufb %%xmm2,%%xmm6 \n" - "pmaddubsw %%xmm5,%%xmm6 \n" - "paddsw %%xmm1,%%xmm6 \n" - "psrlw $0x2,%%xmm6 \n" - "packuswb %%xmm6,%%xmm6 \n" - "movq %%xmm6,(%1) \n" - "movdqu 0x8(%0),%%xmm6 \n" - "movdqu 0x8(%0,%3,1),%%xmm7 \n" - "pavgb %%xmm6,%%xmm7 \n" - "pavgb %%xmm7,%%xmm6 \n" - "pshufb %%xmm3,%%xmm6 \n" - "pmaddubsw %%xmm0,%%xmm6 \n" - "paddsw %%xmm1,%%xmm6 \n" - "psrlw $0x2,%%xmm6 \n" - "packuswb %%xmm6,%%xmm6 \n" - "movq %%xmm6,0x8(%1) \n" - "movdqu 0x10(%0),%%xmm6 \n" - "movdqu 0x10(%0,%3,1),%%xmm7 \n" - "lea 0x20(%0),%0 \n" - "pavgb %%xmm6,%%xmm7 \n" - "pavgb %%xmm7,%%xmm6 \n" - "pshufb %%xmm4,%%xmm6 \n" - "pmaddubsw %4,%%xmm6 \n" - "paddsw %%xmm1,%%xmm6 \n" - "psrlw $0x2,%%xmm6 \n" - "packuswb %%xmm6,%%xmm6 \n" - "movq %%xmm6,0x10(%1) \n" - "lea 0x18(%1),%1 \n" - "sub $0x18,%2 \n" - "jg 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : "r"((intptr_t)(src_stride)), // %3 - "m"(kMadd21) // %4 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", - "xmm6", "xmm7"); -} - -void ScaleRowDown38_SSSE3(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width) { - (void)src_stride; - asm volatile( - "movdqa %3,%%xmm4 \n" - "movdqa %4,%%xmm5 \n" - - LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "lea 0x20(%0),%0 \n" - "pshufb %%xmm4,%%xmm0 \n" - "pshufb %%xmm5,%%xmm1 \n" - "paddusb %%xmm1,%%xmm0 \n" - "movq %%xmm0,(%1) \n" - "movhlps %%xmm0,%%xmm1 \n" - "movd %%xmm1,0x8(%1) \n" - "lea 0xc(%1),%1 \n" - "sub $0xc,%2 \n" - "jg 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : 
"m"(kShuf38a), // %3 - "m"(kShuf38b) // %4 - : "memory", "cc", "xmm0", "xmm1", "xmm4", "xmm5"); -} - -void ScaleRowDown38_2_Box_SSSE3(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width) { - asm volatile( - "movdqa %0,%%xmm2 \n" - "movdqa %1,%%xmm3 \n" - "movdqa %2,%%xmm4 \n" - "movdqa %3,%%xmm5 \n" - : - : "m"(kShufAb0), // %0 - "m"(kShufAb1), // %1 - "m"(kShufAb2), // %2 - "m"(kScaleAb2) // %3 - ); - asm volatile(LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x00(%0,%3,1),%%xmm1 \n" - "lea 0x10(%0),%0 \n" - "pavgb %%xmm1,%%xmm0 \n" - "movdqa %%xmm0,%%xmm1 \n" - "pshufb %%xmm2,%%xmm1 \n" - "movdqa %%xmm0,%%xmm6 \n" - "pshufb %%xmm3,%%xmm6 \n" - "paddusw %%xmm6,%%xmm1 \n" - "pshufb %%xmm4,%%xmm0 \n" - "paddusw %%xmm0,%%xmm1 \n" - "pmulhuw %%xmm5,%%xmm1 \n" - "packuswb %%xmm1,%%xmm1 \n" - "movd %%xmm1,(%1) \n" - "psrlq $0x10,%%xmm1 \n" - "movd %%xmm1,0x2(%1) \n" - "lea 0x6(%1),%1 \n" - "sub $0x6,%2 \n" - "jg 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : "r"((intptr_t)(src_stride)) // %3 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", - "xmm6"); -} - -void ScaleRowDown38_3_Box_SSSE3(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width) { - asm volatile( - "movdqa %0,%%xmm2 \n" - "movdqa %1,%%xmm3 \n" - "movdqa %2,%%xmm4 \n" - "pxor %%xmm5,%%xmm5 \n" - : - : "m"(kShufAc), // %0 - "m"(kShufAc3), // %1 - "m"(kScaleAc33) // %2 - ); - asm volatile(LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x00(%0,%3,1),%%xmm6 \n" - "movhlps %%xmm0,%%xmm1 \n" - "movhlps %%xmm6,%%xmm7 \n" - "punpcklbw %%xmm5,%%xmm0 \n" - "punpcklbw %%xmm5,%%xmm1 \n" - "punpcklbw %%xmm5,%%xmm6 \n" - "punpcklbw %%xmm5,%%xmm7 \n" - "paddusw %%xmm6,%%xmm0 \n" - "paddusw %%xmm7,%%xmm1 \n" - "movdqu 0x00(%0,%3,2),%%xmm6 \n" - "lea 0x10(%0),%0 \n" - "movhlps %%xmm6,%%xmm7 \n" - "punpcklbw %%xmm5,%%xmm6 \n" - "punpcklbw %%xmm5,%%xmm7 \n" - "paddusw %%xmm6,%%xmm0 \n" - "paddusw %%xmm7,%%xmm1 \n" - "movdqa %%xmm0,%%xmm6 \n" - "psrldq $0x2,%%xmm0 \n" - "paddusw %%xmm0,%%xmm6 \n" - "psrldq $0x2,%%xmm0 \n" - "paddusw %%xmm0,%%xmm6 \n" - "pshufb %%xmm2,%%xmm6 \n" - "movdqa %%xmm1,%%xmm7 \n" - "psrldq $0x2,%%xmm1 \n" - "paddusw %%xmm1,%%xmm7 \n" - "psrldq $0x2,%%xmm1 \n" - "paddusw %%xmm1,%%xmm7 \n" - "pshufb %%xmm3,%%xmm7 \n" - "paddusw %%xmm7,%%xmm6 \n" - "pmulhuw %%xmm4,%%xmm6 \n" - "packuswb %%xmm6,%%xmm6 \n" - "movd %%xmm6,(%1) \n" - "psrlq $0x10,%%xmm6 \n" - "movd %%xmm6,0x2(%1) \n" - "lea 0x6(%1),%1 \n" - "sub $0x6,%2 \n" - "jg 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : "r"((intptr_t)(src_stride)) // %3 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", - "xmm6", "xmm7"); -} - -static const uvec8 kLinearShuffleFar = {2, 3, 0, 1, 6, 7, 4, 5, - 10, 11, 8, 9, 14, 15, 12, 13}; - -static const uvec8 kLinearMadd31 = {3, 1, 1, 3, 3, 1, 1, 3, - 3, 1, 1, 3, 3, 1, 1, 3}; - -#ifdef HAS_SCALEROWUP2LINEAR_SSE2 -void ScaleRowUp2_Linear_SSE2(const uint8_t* src_ptr, - uint8_t* dst_ptr, - int dst_width) { - asm volatile( - "pxor %%xmm0,%%xmm0 \n" // 0 - "pcmpeqw %%xmm6,%%xmm6 \n" - "psrlw $15,%%xmm6 \n" - "psllw $1,%%xmm6 \n" // all 2 - - LABELALIGN - "1: \n" - "movq (%0),%%xmm1 \n" // 01234567 - "movq 1(%0),%%xmm2 \n" // 12345678 - "movdqa %%xmm1,%%xmm3 \n" - "punpcklbw %%xmm2,%%xmm3 \n" // 0112233445566778 - "punpcklbw %%xmm1,%%xmm1 \n" // 0011223344556677 - "punpcklbw %%xmm2,%%xmm2 \n" // 1122334455667788 - "movdqa %%xmm1,%%xmm4 \n" - "punpcklbw 
%%xmm0,%%xmm4 \n" // 00112233 (16) - "movdqa %%xmm2,%%xmm5 \n" - "punpcklbw %%xmm0,%%xmm5 \n" // 11223344 (16) - "paddw %%xmm5,%%xmm4 \n" - "movdqa %%xmm3,%%xmm5 \n" - "paddw %%xmm6,%%xmm4 \n" - "punpcklbw %%xmm0,%%xmm5 \n" // 01122334 (16) - "paddw %%xmm5,%%xmm5 \n" - "paddw %%xmm4,%%xmm5 \n" // 3*near+far+2 (lo) - "psrlw $2,%%xmm5 \n" // 3/4*near+1/4*far (lo) - - "punpckhbw %%xmm0,%%xmm1 \n" // 44556677 (16) - "punpckhbw %%xmm0,%%xmm2 \n" // 55667788 (16) - "paddw %%xmm2,%%xmm1 \n" - "punpckhbw %%xmm0,%%xmm3 \n" // 45566778 (16) - "paddw %%xmm6,%%xmm1 \n" - "paddw %%xmm3,%%xmm3 \n" - "paddw %%xmm3,%%xmm1 \n" // 3*near+far+2 (hi) - "psrlw $2,%%xmm1 \n" // 3/4*near+1/4*far (hi) - - "packuswb %%xmm1,%%xmm5 \n" - "movdqu %%xmm5,(%1) \n" - - "lea 0x8(%0),%0 \n" - "lea 0x10(%1),%1 \n" // 8 sample to 16 sample - "sub $0x10,%2 \n" - "jg 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"); -} -#endif - -#ifdef HAS_SCALEROWUP2BILINEAR_SSE2 -void ScaleRowUp2_Bilinear_SSE2(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - ptrdiff_t dst_stride, - int dst_width) { - asm volatile( - LABELALIGN - "1: \n" - "pxor %%xmm0,%%xmm0 \n" // 0 - // above line - "movq (%0),%%xmm1 \n" // 01234567 - "movq 1(%0),%%xmm2 \n" // 12345678 - "movdqa %%xmm1,%%xmm3 \n" - "punpcklbw %%xmm2,%%xmm3 \n" // 0112233445566778 - "punpcklbw %%xmm1,%%xmm1 \n" // 0011223344556677 - "punpcklbw %%xmm2,%%xmm2 \n" // 1122334455667788 - - "movdqa %%xmm1,%%xmm4 \n" - "punpcklbw %%xmm0,%%xmm4 \n" // 00112233 (16) - "movdqa %%xmm2,%%xmm5 \n" - "punpcklbw %%xmm0,%%xmm5 \n" // 11223344 (16) - "paddw %%xmm5,%%xmm4 \n" // near+far - "movdqa %%xmm3,%%xmm5 \n" - "punpcklbw %%xmm0,%%xmm5 \n" // 01122334 (16) - "paddw %%xmm5,%%xmm5 \n" // 2*near - "paddw %%xmm5,%%xmm4 \n" // 3*near+far (1, lo) - - "punpckhbw %%xmm0,%%xmm1 \n" // 44556677 (16) - "punpckhbw %%xmm0,%%xmm2 \n" // 55667788 (16) - "paddw %%xmm2,%%xmm1 \n" - "punpckhbw %%xmm0,%%xmm3 \n" // 45566778 (16) - "paddw %%xmm3,%%xmm3 \n" // 2*near - "paddw %%xmm3,%%xmm1 \n" // 3*near+far (1, hi) - - // below line - "movq (%0,%3),%%xmm6 \n" // 01234567 - "movq 1(%0,%3),%%xmm2 \n" // 12345678 - "movdqa %%xmm6,%%xmm3 \n" - "punpcklbw %%xmm2,%%xmm3 \n" // 0112233445566778 - "punpcklbw %%xmm6,%%xmm6 \n" // 0011223344556677 - "punpcklbw %%xmm2,%%xmm2 \n" // 1122334455667788 - - "movdqa %%xmm6,%%xmm5 \n" - "punpcklbw %%xmm0,%%xmm5 \n" // 00112233 (16) - "movdqa %%xmm2,%%xmm7 \n" - "punpcklbw %%xmm0,%%xmm7 \n" // 11223344 (16) - "paddw %%xmm7,%%xmm5 \n" // near+far - "movdqa %%xmm3,%%xmm7 \n" - "punpcklbw %%xmm0,%%xmm7 \n" // 01122334 (16) - "paddw %%xmm7,%%xmm7 \n" // 2*near - "paddw %%xmm7,%%xmm5 \n" // 3*near+far (2, lo) - - "punpckhbw %%xmm0,%%xmm6 \n" // 44556677 (16) - "punpckhbw %%xmm0,%%xmm2 \n" // 55667788 (16) - "paddw %%xmm6,%%xmm2 \n" // near+far - "punpckhbw %%xmm0,%%xmm3 \n" // 45566778 (16) - "paddw %%xmm3,%%xmm3 \n" // 2*near - "paddw %%xmm3,%%xmm2 \n" // 3*near+far (2, hi) - - // xmm4 xmm1 - // xmm5 xmm2 - "pcmpeqw %%xmm0,%%xmm0 \n" - "psrlw $15,%%xmm0 \n" - "psllw $3,%%xmm0 \n" // all 8 - - "movdqa %%xmm4,%%xmm3 \n" - "movdqa %%xmm5,%%xmm6 \n" - "paddw %%xmm3,%%xmm3 \n" // 6*near+2*far (1, lo) - "paddw %%xmm0,%%xmm6 \n" // 3*near+far+8 (2, lo) - "paddw %%xmm4,%%xmm3 \n" // 9*near+3*far (1, lo) - "paddw %%xmm6,%%xmm3 \n" // 9 3 3 1 + 8 (1, lo) - "psrlw $4,%%xmm3 \n" // ^ div by 16 - - "movdqa %%xmm1,%%xmm7 \n" - "movdqa %%xmm2,%%xmm6 \n" - "paddw %%xmm7,%%xmm7 \n" 
// 6*near+2*far (1, hi) - "paddw %%xmm0,%%xmm6 \n" // 3*near+far+8 (2, hi) - "paddw %%xmm1,%%xmm7 \n" // 9*near+3*far (1, hi) - "paddw %%xmm6,%%xmm7 \n" // 9 3 3 1 + 8 (1, hi) - "psrlw $4,%%xmm7 \n" // ^ div by 16 - - "packuswb %%xmm7,%%xmm3 \n" - "movdqu %%xmm3,(%1) \n" // save above line - - "movdqa %%xmm5,%%xmm3 \n" - "paddw %%xmm0,%%xmm4 \n" // 3*near+far+8 (1, lo) - "paddw %%xmm3,%%xmm3 \n" // 6*near+2*far (2, lo) - "paddw %%xmm3,%%xmm5 \n" // 9*near+3*far (2, lo) - "paddw %%xmm4,%%xmm5 \n" // 9 3 3 1 + 8 (lo) - "psrlw $4,%%xmm5 \n" // ^ div by 16 - - "movdqa %%xmm2,%%xmm3 \n" - "paddw %%xmm0,%%xmm1 \n" // 3*near+far+8 (1, hi) - "paddw %%xmm3,%%xmm3 \n" // 6*near+2*far (2, hi) - "paddw %%xmm3,%%xmm2 \n" // 9*near+3*far (2, hi) - "paddw %%xmm1,%%xmm2 \n" // 9 3 3 1 + 8 (hi) - "psrlw $4,%%xmm2 \n" // ^ div by 16 - - "packuswb %%xmm2,%%xmm5 \n" - "movdqu %%xmm5,(%1,%4) \n" // save below line - - "lea 0x8(%0),%0 \n" - "lea 0x10(%1),%1 \n" // 8 sample to 16 sample - "sub $0x10,%2 \n" - "jg 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : "r"((intptr_t)(src_stride)), // %3 - "r"((intptr_t)(dst_stride)) // %4 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", - "xmm7"); -} -#endif - -#ifdef HAS_SCALEROWUP2LINEAR_12_SSSE3 -void ScaleRowUp2_Linear_12_SSSE3(const uint16_t* src_ptr, - uint16_t* dst_ptr, - int dst_width) { - asm volatile( - "movdqa %3,%%xmm5 \n" - "pcmpeqw %%xmm4,%%xmm4 \n" - "psrlw $15,%%xmm4 \n" - "psllw $1,%%xmm4 \n" // all 2 - - LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" // 01234567 (16) - "movdqu 2(%0),%%xmm1 \n" // 12345678 (16) - - "movdqa %%xmm0,%%xmm2 \n" - "punpckhwd %%xmm1,%%xmm2 \n" // 45566778 (16) - "punpcklwd %%xmm1,%%xmm0 \n" // 01122334 (16) - - "movdqa %%xmm2,%%xmm3 \n" - "movdqa %%xmm0,%%xmm1 \n" - "pshufb %%xmm5,%%xmm3 \n" // 54657687 (far) - "pshufb %%xmm5,%%xmm1 \n" // 10213243 (far) - - "paddw %%xmm4,%%xmm1 \n" // far+2 - "paddw %%xmm4,%%xmm3 \n" // far+2 - "paddw %%xmm0,%%xmm1 \n" // near+far+2 - "paddw %%xmm2,%%xmm3 \n" // near+far+2 - "paddw %%xmm0,%%xmm0 \n" // 2*near - "paddw %%xmm2,%%xmm2 \n" // 2*near - "paddw %%xmm1,%%xmm0 \n" // 3*near+far+2 (lo) - "paddw %%xmm3,%%xmm2 \n" // 3*near+far+2 (hi) - - "psrlw $2,%%xmm0 \n" // 3/4*near+1/4*far - "psrlw $2,%%xmm2 \n" // 3/4*near+1/4*far - "movdqu %%xmm0,(%1) \n" - "movdqu %%xmm2,16(%1) \n" - - "lea 0x10(%0),%0 \n" - "lea 0x20(%1),%1 \n" // 8 sample to 16 sample - "sub $0x10,%2 \n" - "jg 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : "m"(kLinearShuffleFar) // %3 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); -} -#endif - -#ifdef HAS_SCALEROWUP2BILINEAR_12_SSSE3 -void ScaleRowUp2_Bilinear_12_SSSE3(const uint16_t* src_ptr, - ptrdiff_t src_stride, - uint16_t* dst_ptr, - ptrdiff_t dst_stride, - int dst_width) { - asm volatile( - "pcmpeqw %%xmm7,%%xmm7 \n" - "psrlw $15,%%xmm7 \n" - "psllw $3,%%xmm7 \n" // all 8 - "movdqa %5,%%xmm6 \n" - - LABELALIGN - "1: \n" - // above line - "movdqu (%0),%%xmm0 \n" // 01234567 (16) - "movdqu 2(%0),%%xmm1 \n" // 12345678 (16) - "movdqa %%xmm0,%%xmm2 \n" - "punpckhwd %%xmm1,%%xmm2 \n" // 45566778 (16) - "punpcklwd %%xmm1,%%xmm0 \n" // 01122334 (16) - "movdqa %%xmm2,%%xmm3 \n" - "movdqa %%xmm0,%%xmm1 \n" - "pshufb %%xmm6,%%xmm3 \n" // 54657687 (far) - "pshufb %%xmm6,%%xmm1 \n" // 10213243 (far) - "paddw %%xmm0,%%xmm1 \n" // near+far - "paddw %%xmm2,%%xmm3 \n" // near+far - "paddw %%xmm0,%%xmm0 \n" // 2*near - "paddw %%xmm2,%%xmm2 \n" // 2*near - "paddw 
%%xmm1,%%xmm0 \n" // 3*near+far (1, lo) - "paddw %%xmm3,%%xmm2 \n" // 3*near+far (1, hi) - - // below line - "movdqu (%0,%3,2),%%xmm1 \n" // 01234567 (16) - "movdqu 2(%0,%3,2),%%xmm4 \n" // 12345678 (16) - "movdqa %%xmm1,%%xmm3 \n" - "punpckhwd %%xmm4,%%xmm3 \n" // 45566778 (16) - "punpcklwd %%xmm4,%%xmm1 \n" // 01122334 (16) - "movdqa %%xmm3,%%xmm5 \n" - "movdqa %%xmm1,%%xmm4 \n" - "pshufb %%xmm6,%%xmm5 \n" // 54657687 (far) - "pshufb %%xmm6,%%xmm4 \n" // 10213243 (far) - "paddw %%xmm1,%%xmm4 \n" // near+far - "paddw %%xmm3,%%xmm5 \n" // near+far - "paddw %%xmm1,%%xmm1 \n" // 2*near - "paddw %%xmm3,%%xmm3 \n" // 2*near - "paddw %%xmm4,%%xmm1 \n" // 3*near+far (2, lo) - "paddw %%xmm5,%%xmm3 \n" // 3*near+far (2, hi) - - // xmm0 xmm2 - // xmm1 xmm3 - - "movdqa %%xmm0,%%xmm4 \n" - "movdqa %%xmm1,%%xmm5 \n" - "paddw %%xmm4,%%xmm4 \n" // 6*near+2*far (1, lo) - "paddw %%xmm7,%%xmm5 \n" // 3*near+far+8 (2, lo) - "paddw %%xmm0,%%xmm4 \n" // 9*near+3*far (1, lo) - "paddw %%xmm5,%%xmm4 \n" // 9 3 3 1 + 8 (1, lo) - "psrlw $4,%%xmm4 \n" // ^ div by 16 - "movdqu %%xmm4,(%1) \n" - - "movdqa %%xmm2,%%xmm4 \n" - "movdqa %%xmm3,%%xmm5 \n" - "paddw %%xmm4,%%xmm4 \n" // 6*near+2*far (1, hi) - "paddw %%xmm7,%%xmm5 \n" // 3*near+far+8 (2, hi) - "paddw %%xmm2,%%xmm4 \n" // 9*near+3*far (1, hi) - "paddw %%xmm5,%%xmm4 \n" // 9 3 3 1 + 8 (1, hi) - "psrlw $4,%%xmm4 \n" // ^ div by 16 - "movdqu %%xmm4,0x10(%1) \n" - - "movdqa %%xmm1,%%xmm4 \n" - "paddw %%xmm7,%%xmm0 \n" // 3*near+far+8 (1, lo) - "paddw %%xmm4,%%xmm4 \n" // 6*near+2*far (2, lo) - "paddw %%xmm4,%%xmm1 \n" // 9*near+3*far (2, lo) - "paddw %%xmm0,%%xmm1 \n" // 9 3 3 1 + 8 (2, lo) - "psrlw $4,%%xmm1 \n" // ^ div by 16 - "movdqu %%xmm1,(%1,%4,2) \n" - - "movdqa %%xmm3,%%xmm4 \n" - "paddw %%xmm7,%%xmm2 \n" // 3*near+far+8 (1, hi) - "paddw %%xmm4,%%xmm4 \n" // 6*near+2*far (2, hi) - "paddw %%xmm4,%%xmm3 \n" // 9*near+3*far (2, hi) - "paddw %%xmm2,%%xmm3 \n" // 9 3 3 1 + 8 (2, hi) - "psrlw $4,%%xmm3 \n" // ^ div by 16 - "movdqu %%xmm3,0x10(%1,%4,2) \n" - - "lea 0x10(%0),%0 \n" - "lea 0x20(%1),%1 \n" // 8 sample to 16 sample - "sub $0x10,%2 \n" - "jg 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : "r"((intptr_t)(src_stride)), // %3 - "r"((intptr_t)(dst_stride)), // %4 - "m"(kLinearShuffleFar) // %5 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"); -} -#endif - -#ifdef HAS_SCALEROWUP2LINEAR_16_SSE2 -void ScaleRowUp2_Linear_16_SSE2(const uint16_t* src_ptr, - uint16_t* dst_ptr, - int dst_width) { - asm volatile( - "pxor %%xmm5,%%xmm5 \n" - "pcmpeqd %%xmm4,%%xmm4 \n" - "psrld $31,%%xmm4 \n" - "pslld $1,%%xmm4 \n" // all 2 - - LABELALIGN - "1: \n" - "movq (%0),%%xmm0 \n" // 0123 (16b) - "movq 2(%0),%%xmm1 \n" // 1234 (16b) - - "punpcklwd %%xmm5,%%xmm0 \n" // 0123 (32b) - "punpcklwd %%xmm5,%%xmm1 \n" // 1234 (32b) - - "movdqa %%xmm0,%%xmm2 \n" - "movdqa %%xmm1,%%xmm3 \n" - - "pshufd $0b10110001,%%xmm2,%%xmm2 \n" // 1032 (even, far) - "pshufd $0b10110001,%%xmm3,%%xmm3 \n" // 2143 (odd, far) - - "paddd %%xmm4,%%xmm2 \n" // far+2 (lo) - "paddd %%xmm4,%%xmm3 \n" // far+2 (hi) - "paddd %%xmm0,%%xmm2 \n" // near+far+2 (lo) - "paddd %%xmm1,%%xmm3 \n" // near+far+2 (hi) - "paddd %%xmm0,%%xmm0 \n" // 2*near (lo) - "paddd %%xmm1,%%xmm1 \n" // 2*near (hi) - "paddd %%xmm2,%%xmm0 \n" // 3*near+far+2 (lo) - "paddd %%xmm3,%%xmm1 \n" // 3*near+far+2 (hi) - - "psrld $2,%%xmm0 \n" // 3/4*near+1/4*far (lo) - "psrld $2,%%xmm1 \n" // 3/4*near+1/4*far (hi) - "packssdw %%xmm1,%%xmm0 \n" - "pshufd $0b11011000,%%xmm0,%%xmm0 \n" 
- "movdqu %%xmm0,(%1) \n" - - "lea 0x8(%0),%0 \n" - "lea 0x10(%1),%1 \n" // 4 pixel to 8 pixel - "sub $0x8,%2 \n" - "jg 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); -} -#endif - -#ifdef HAS_SCALEROWUP2BILINEAR_16_SSE2 -void ScaleRowUp2_Bilinear_16_SSE2(const uint16_t* src_ptr, - ptrdiff_t src_stride, - uint16_t* dst_ptr, - ptrdiff_t dst_stride, - int dst_width) { - asm volatile( - "pxor %%xmm7,%%xmm7 \n" - "pcmpeqd %%xmm6,%%xmm6 \n" - "psrld $31,%%xmm6 \n" - "pslld $3,%%xmm6 \n" // all 8 - - LABELALIGN - "1: \n" - "movq (%0),%%xmm0 \n" // 0011 (16b, 1u1v) - "movq 4(%0),%%xmm1 \n" // 1122 (16b, 1u1v) - "punpcklwd %%xmm7,%%xmm0 \n" // 0011 (near) (32b, 1u1v) - "punpcklwd %%xmm7,%%xmm1 \n" // 1122 (near) (32b, 1u1v) - "movdqa %%xmm0,%%xmm2 \n" - "movdqa %%xmm1,%%xmm3 \n" - "pshufd $0b01001110,%%xmm2,%%xmm2 \n" // 1100 (far) (1, lo) - "pshufd $0b01001110,%%xmm3,%%xmm3 \n" // 2211 (far) (1, hi) - "paddd %%xmm0,%%xmm2 \n" // near+far (1, lo) - "paddd %%xmm1,%%xmm3 \n" // near+far (1, hi) - "paddd %%xmm0,%%xmm0 \n" // 2*near (1, lo) - "paddd %%xmm1,%%xmm1 \n" // 2*near (1, hi) - "paddd %%xmm2,%%xmm0 \n" // 3*near+far (1, lo) - "paddd %%xmm3,%%xmm1 \n" // 3*near+far (1, hi) - - "movq (%0),%%xmm0 \n" // 0123 (16b) - "movq 2(%0),%%xmm1 \n" // 1234 (16b) - "punpcklwd %%xmm7,%%xmm0 \n" // 0123 (32b) - "punpcklwd %%xmm7,%%xmm1 \n" // 1234 (32b) - "movdqa %%xmm0,%%xmm2 \n" - "movdqa %%xmm1,%%xmm3 \n" - "pshufd $0b10110001,%%xmm2,%%xmm2 \n" // 1032 (even, far) - "pshufd $0b10110001,%%xmm3,%%xmm3 \n" // 2143 (odd, far) - "paddd %%xmm0,%%xmm2 \n" // near+far (lo) - "paddd %%xmm1,%%xmm3 \n" // near+far (hi) - "paddd %%xmm0,%%xmm0 \n" // 2*near (lo) - "paddd %%xmm1,%%xmm1 \n" // 2*near (hi) - "paddd %%xmm2,%%xmm0 \n" // 3*near+far (1, lo) - "paddd %%xmm3,%%xmm1 \n" // 3*near+far (1, hi) - - "movq (%0,%3,2),%%xmm2 \n" - "movq 2(%0,%3,2),%%xmm3 \n" - "punpcklwd %%xmm7,%%xmm2 \n" // 0123 (32b) - "punpcklwd %%xmm7,%%xmm3 \n" // 1234 (32b) - "movdqa %%xmm2,%%xmm4 \n" - "movdqa %%xmm3,%%xmm5 \n" - "pshufd $0b10110001,%%xmm4,%%xmm4 \n" // 1032 (even, far) - "pshufd $0b10110001,%%xmm5,%%xmm5 \n" // 2143 (odd, far) - "paddd %%xmm2,%%xmm4 \n" // near+far (lo) - "paddd %%xmm3,%%xmm5 \n" // near+far (hi) - "paddd %%xmm2,%%xmm2 \n" // 2*near (lo) - "paddd %%xmm3,%%xmm3 \n" // 2*near (hi) - "paddd %%xmm4,%%xmm2 \n" // 3*near+far (2, lo) - "paddd %%xmm5,%%xmm3 \n" // 3*near+far (2, hi) - - "movdqa %%xmm0,%%xmm4 \n" - "movdqa %%xmm2,%%xmm5 \n" - "paddd %%xmm0,%%xmm4 \n" // 6*near+2*far (1, lo) - "paddd %%xmm6,%%xmm5 \n" // 3*near+far+8 (2, lo) - "paddd %%xmm0,%%xmm4 \n" // 9*near+3*far (1, lo) - "paddd %%xmm5,%%xmm4 \n" // 9 3 3 1 + 8 (1, lo) - "psrld $4,%%xmm4 \n" // ^ div by 16 (1, lo) - - "movdqa %%xmm2,%%xmm5 \n" - "paddd %%xmm2,%%xmm5 \n" // 6*near+2*far (2, lo) - "paddd %%xmm6,%%xmm0 \n" // 3*near+far+8 (1, lo) - "paddd %%xmm2,%%xmm5 \n" // 9*near+3*far (2, lo) - "paddd %%xmm0,%%xmm5 \n" // 9 3 3 1 + 8 (2, lo) - "psrld $4,%%xmm5 \n" // ^ div by 16 (2, lo) - - "movdqa %%xmm1,%%xmm0 \n" - "movdqa %%xmm3,%%xmm2 \n" - "paddd %%xmm1,%%xmm0 \n" // 6*near+2*far (1, hi) - "paddd %%xmm6,%%xmm2 \n" // 3*near+far+8 (2, hi) - "paddd %%xmm1,%%xmm0 \n" // 9*near+3*far (1, hi) - "paddd %%xmm2,%%xmm0 \n" // 9 3 3 1 + 8 (1, hi) - "psrld $4,%%xmm0 \n" // ^ div by 16 (1, hi) - - "movdqa %%xmm3,%%xmm2 \n" - "paddd %%xmm3,%%xmm2 \n" // 6*near+2*far (2, hi) - "paddd %%xmm6,%%xmm1 \n" // 3*near+far+8 (1, hi) - "paddd %%xmm3,%%xmm2 \n" 
// 9*near+3*far (2, hi) - "paddd %%xmm1,%%xmm2 \n" // 9 3 3 1 + 8 (2, hi) - "psrld $4,%%xmm2 \n" // ^ div by 16 (2, hi) - - "packssdw %%xmm0,%%xmm4 \n" - "pshufd $0b11011000,%%xmm4,%%xmm4 \n" - "movdqu %%xmm4,(%1) \n" // store above - "packssdw %%xmm2,%%xmm5 \n" - "pshufd $0b11011000,%%xmm4,%%xmm4 \n" - "movdqu %%xmm5,(%1,%4,2) \n" // store below - - "lea 0x8(%0),%0 \n" - "lea 0x10(%1),%1 \n" // 4 pixel to 8 pixel - "sub $0x8,%2 \n" - "jg 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : "r"((intptr_t)(src_stride)), // %3 - "r"((intptr_t)(dst_stride)) // %4 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"); -} -#endif - -#ifdef HAS_SCALEROWUP2LINEAR_SSSE3 -void ScaleRowUp2_Linear_SSSE3(const uint8_t* src_ptr, - uint8_t* dst_ptr, - int dst_width) { - asm volatile( - "pcmpeqw %%xmm4,%%xmm4 \n" - "psrlw $15,%%xmm4 \n" - "psllw $1,%%xmm4 \n" // all 2 - "movdqa %3,%%xmm3 \n" - - LABELALIGN - "1: \n" - "movq (%0),%%xmm0 \n" // 01234567 - "movq 1(%0),%%xmm1 \n" // 12345678 - "punpcklwd %%xmm0,%%xmm0 \n" // 0101232345456767 - "punpcklwd %%xmm1,%%xmm1 \n" // 1212343456567878 - "movdqa %%xmm0,%%xmm2 \n" - "punpckhdq %%xmm1,%%xmm2 \n" // 4545565667677878 - "punpckldq %%xmm1,%%xmm0 \n" // 0101121223233434 - "pmaddubsw %%xmm3,%%xmm2 \n" // 3*near+far (hi) - "pmaddubsw %%xmm3,%%xmm0 \n" // 3*near+far (lo) - "paddw %%xmm4,%%xmm0 \n" // 3*near+far+2 (lo) - "paddw %%xmm4,%%xmm2 \n" // 3*near+far+2 (hi) - "psrlw $2,%%xmm0 \n" // 3/4*near+1/4*far (lo) - "psrlw $2,%%xmm2 \n" // 3/4*near+1/4*far (hi) - "vpackuswb %%xmm2,%%xmm0,%%xmm0 \n" - "vmovdqu %%xmm0,(%1) \n" - - "lea 0x8(%0),%0 \n" - "lea 0x10(%1),%1 \n" // 8 sample to 16 sample - "sub $0x10,%2 \n" - "jg 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : "m"(kLinearMadd31) // %3 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"); -} -#endif - -#ifdef HAS_SCALEROWUP2BILINEAR_SSSE3 -void ScaleRowUp2_Bilinear_SSSE3(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - ptrdiff_t dst_stride, - int dst_width) { - asm volatile( - "pcmpeqw %%xmm6,%%xmm6 \n" - "psrlw $15,%%xmm6 \n" - "psllw $3,%%xmm6 \n" // all 8 - "movdqa %5,%%xmm7 \n" - - LABELALIGN - "1: \n" - "movq (%0),%%xmm0 \n" // 01234567 - "movq 1(%0),%%xmm1 \n" // 12345678 - "punpcklwd %%xmm0,%%xmm0 \n" // 0101232345456767 - "punpcklwd %%xmm1,%%xmm1 \n" // 1212343456567878 - "movdqa %%xmm0,%%xmm2 \n" - "punpckhdq %%xmm1,%%xmm2 \n" // 4545565667677878 - "punpckldq %%xmm1,%%xmm0 \n" // 0101121223233434 - "pmaddubsw %%xmm7,%%xmm2 \n" // 3*near+far (1, hi) - "pmaddubsw %%xmm7,%%xmm0 \n" // 3*near+far (1, lo) - - "movq (%0,%3),%%xmm1 \n" - "movq 1(%0,%3),%%xmm4 \n" - "punpcklwd %%xmm1,%%xmm1 \n" - "punpcklwd %%xmm4,%%xmm4 \n" - "movdqa %%xmm1,%%xmm3 \n" - "punpckhdq %%xmm4,%%xmm3 \n" - "punpckldq %%xmm4,%%xmm1 \n" - "pmaddubsw %%xmm7,%%xmm3 \n" // 3*near+far (2, hi) - "pmaddubsw %%xmm7,%%xmm1 \n" // 3*near+far (2, lo) - - // xmm0 xmm2 - // xmm1 xmm3 - - "movdqa %%xmm0,%%xmm4 \n" - "movdqa %%xmm1,%%xmm5 \n" - "paddw %%xmm0,%%xmm4 \n" // 6*near+2*far (1, lo) - "paddw %%xmm6,%%xmm5 \n" // 3*near+far+8 (2, lo) - "paddw %%xmm0,%%xmm4 \n" // 9*near+3*far (1, lo) - "paddw %%xmm5,%%xmm4 \n" // 9 3 3 1 + 8 (1, lo) - "psrlw $4,%%xmm4 \n" // ^ div by 16 (1, lo) - - "movdqa %%xmm1,%%xmm5 \n" - "paddw %%xmm1,%%xmm5 \n" // 6*near+2*far (2, lo) - "paddw %%xmm6,%%xmm0 \n" // 3*near+far+8 (1, lo) - "paddw %%xmm1,%%xmm5 \n" // 9*near+3*far (2, lo) - "paddw %%xmm0,%%xmm5 \n" // 9 3 3 1 + 8 (2, lo) 
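// Note (illustrative, not from the libyuv sources): the 9/3/3/1 weights used
// throughout these upsamplers implement 2x bilinear interpolation at
// quarter-pixel phase; each output is
// (9*near + 3*far_horizontal + 3*far_vertical + 1*far_diagonal + 8) >> 4,
// i.e. the (3/4, 1/4) linear filter applied in both directions with rounding.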
- "psrlw $4,%%xmm5 \n" // ^ div by 16 (2, lo) - - "movdqa %%xmm2,%%xmm0 \n" - "movdqa %%xmm3,%%xmm1 \n" - "paddw %%xmm2,%%xmm0 \n" // 6*near+2*far (1, hi) - "paddw %%xmm6,%%xmm1 \n" // 3*near+far+8 (2, hi) - "paddw %%xmm2,%%xmm0 \n" // 9*near+3*far (1, hi) - "paddw %%xmm1,%%xmm0 \n" // 9 3 3 1 + 8 (1, hi) - "psrlw $4,%%xmm0 \n" // ^ div by 16 (1, hi) - - "movdqa %%xmm3,%%xmm1 \n" - "paddw %%xmm3,%%xmm1 \n" // 6*near+2*far (2, hi) - "paddw %%xmm6,%%xmm2 \n" // 3*near+far+8 (1, hi) - "paddw %%xmm3,%%xmm1 \n" // 9*near+3*far (2, hi) - "paddw %%xmm2,%%xmm1 \n" // 9 3 3 1 + 8 (2, hi) - "psrlw $4,%%xmm1 \n" // ^ div by 16 (2, hi) - - "packuswb %%xmm0,%%xmm4 \n" - "movdqu %%xmm4,(%1) \n" // store above - "packuswb %%xmm1,%%xmm5 \n" - "movdqu %%xmm5,(%1,%4) \n" // store below - - "lea 0x8(%0),%0 \n" - "lea 0x10(%1),%1 \n" // 8 sample to 16 sample - "sub $0x10,%2 \n" - "jg 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : "r"((intptr_t)(src_stride)), // %3 - "r"((intptr_t)(dst_stride)), // %4 - "m"(kLinearMadd31) // %5 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", - "xmm7"); -} -#endif - -#ifdef HAS_SCALEROWUP2LINEAR_AVX2 -void ScaleRowUp2_Linear_AVX2(const uint8_t* src_ptr, - uint8_t* dst_ptr, - int dst_width) { - asm volatile( - "vpcmpeqw %%ymm4,%%ymm4,%%ymm4 \n" - "vpsrlw $15,%%ymm4,%%ymm4 \n" - "vpsllw $1,%%ymm4,%%ymm4 \n" // all 2 - "vbroadcastf128 %3,%%ymm3 \n" - - LABELALIGN - "1: \n" - "vmovdqu (%0),%%xmm0 \n" // 0123456789ABCDEF - "vmovdqu 1(%0),%%xmm1 \n" // 123456789ABCDEF0 - "vpermq $0b11011000,%%ymm0,%%ymm0 \n" - "vpermq $0b11011000,%%ymm1,%%ymm1 \n" - "vpunpcklwd %%ymm0,%%ymm0,%%ymm0 \n" - "vpunpcklwd %%ymm1,%%ymm1,%%ymm1 \n" - "vpunpckhdq %%ymm1,%%ymm0,%%ymm2 \n" - "vpunpckldq %%ymm1,%%ymm0,%%ymm0 \n" - "vpmaddubsw %%ymm3,%%ymm2,%%ymm1 \n" // 3*near+far (hi) - "vpmaddubsw %%ymm3,%%ymm0,%%ymm0 \n" // 3*near+far (lo) - "vpaddw %%ymm4,%%ymm0,%%ymm0 \n" // 3*near+far+2 (lo) - "vpaddw %%ymm4,%%ymm1,%%ymm1 \n" // 3*near+far+2 (hi) - "vpsrlw $2,%%ymm0,%%ymm0 \n" // 3/4*near+1/4*far (lo) - "vpsrlw $2,%%ymm1,%%ymm1 \n" // 3/4*near+1/4*far (hi) - "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" - "vmovdqu %%ymm0,(%1) \n" - - "lea 0x10(%0),%0 \n" - "lea 0x20(%1),%1 \n" // 16 sample to 32 sample - "sub $0x20,%2 \n" - "jg 1b \n" - "vzeroupper \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : "m"(kLinearMadd31) // %3 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"); -} -#endif - -#ifdef HAS_SCALEROWUP2BILINEAR_AVX2 -void ScaleRowUp2_Bilinear_AVX2(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - ptrdiff_t dst_stride, - int dst_width) { - asm volatile( - "vpcmpeqw %%ymm6,%%ymm6,%%ymm6 \n" - "vpsrlw $15,%%ymm6,%%ymm6 \n" - "vpsllw $3,%%ymm6,%%ymm6 \n" // all 8 - "vbroadcastf128 %5,%%ymm7 \n" - - LABELALIGN - "1: \n" - "vmovdqu (%0),%%xmm0 \n" // 0123456789ABCDEF - "vmovdqu 1(%0),%%xmm1 \n" // 123456789ABCDEF0 - "vpermq $0b11011000,%%ymm0,%%ymm0 \n" - "vpermq $0b11011000,%%ymm1,%%ymm1 \n" - "vpunpcklwd %%ymm0,%%ymm0,%%ymm0 \n" - "vpunpcklwd %%ymm1,%%ymm1,%%ymm1 \n" - "vpunpckhdq %%ymm1,%%ymm0,%%ymm2 \n" - "vpunpckldq %%ymm1,%%ymm0,%%ymm0 \n" - "vpmaddubsw %%ymm7,%%ymm2,%%ymm1 \n" // 3*near+far (1, hi) - "vpmaddubsw %%ymm7,%%ymm0,%%ymm0 \n" // 3*near+far (1, lo) - - "vmovdqu (%0,%3),%%xmm2 \n" // 0123456789ABCDEF - "vmovdqu 1(%0,%3),%%xmm3 \n" // 123456789ABCDEF0 - "vpermq $0b11011000,%%ymm2,%%ymm2 \n" - "vpermq $0b11011000,%%ymm3,%%ymm3 \n" - "vpunpcklwd %%ymm2,%%ymm2,%%ymm2 \n" - 
"vpunpcklwd %%ymm3,%%ymm3,%%ymm3 \n" - "vpunpckhdq %%ymm3,%%ymm2,%%ymm4 \n" - "vpunpckldq %%ymm3,%%ymm2,%%ymm2 \n" - "vpmaddubsw %%ymm7,%%ymm4,%%ymm3 \n" // 3*near+far (2, hi) - "vpmaddubsw %%ymm7,%%ymm2,%%ymm2 \n" // 3*near+far (2, lo) - - // ymm0 ymm1 - // ymm2 ymm3 - - "vpaddw %%ymm0,%%ymm0,%%ymm4 \n" // 6*near+2*far (1, lo) - "vpaddw %%ymm6,%%ymm2,%%ymm5 \n" // 3*near+far+8 (2, lo) - "vpaddw %%ymm4,%%ymm0,%%ymm4 \n" // 9*near+3*far (1, lo) - "vpaddw %%ymm4,%%ymm5,%%ymm4 \n" // 9 3 3 1 + 8 (1, lo) - "vpsrlw $4,%%ymm4,%%ymm4 \n" // ^ div by 16 (1, lo) - - "vpaddw %%ymm2,%%ymm2,%%ymm5 \n" // 6*near+2*far (2, lo) - "vpaddw %%ymm6,%%ymm0,%%ymm0 \n" // 3*near+far+8 (1, lo) - "vpaddw %%ymm5,%%ymm2,%%ymm5 \n" // 9*near+3*far (2, lo) - "vpaddw %%ymm5,%%ymm0,%%ymm5 \n" // 9 3 3 1 + 8 (2, lo) - "vpsrlw $4,%%ymm5,%%ymm5 \n" // ^ div by 16 (2, lo) - - "vpaddw %%ymm1,%%ymm1,%%ymm0 \n" // 6*near+2*far (1, hi) - "vpaddw %%ymm6,%%ymm3,%%ymm2 \n" // 3*near+far+8 (2, hi) - "vpaddw %%ymm0,%%ymm1,%%ymm0 \n" // 9*near+3*far (1, hi) - "vpaddw %%ymm0,%%ymm2,%%ymm0 \n" // 9 3 3 1 + 8 (1, hi) - "vpsrlw $4,%%ymm0,%%ymm0 \n" // ^ div by 16 (1, hi) - - "vpaddw %%ymm3,%%ymm3,%%ymm2 \n" // 6*near+2*far (2, hi) - "vpaddw %%ymm6,%%ymm1,%%ymm1 \n" // 3*near+far+8 (1, hi) - "vpaddw %%ymm2,%%ymm3,%%ymm2 \n" // 9*near+3*far (2, hi) - "vpaddw %%ymm2,%%ymm1,%%ymm2 \n" // 9 3 3 1 + 8 (2, hi) - "vpsrlw $4,%%ymm2,%%ymm2 \n" // ^ div by 16 (2, hi) - - "vpackuswb %%ymm0,%%ymm4,%%ymm4 \n" - "vmovdqu %%ymm4,(%1) \n" // store above - "vpackuswb %%ymm2,%%ymm5,%%ymm5 \n" - "vmovdqu %%ymm5,(%1,%4) \n" // store below - - "lea 0x10(%0),%0 \n" - "lea 0x20(%1),%1 \n" // 16 sample to 32 sample - "sub $0x20,%2 \n" - "jg 1b \n" - "vzeroupper \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : "r"((intptr_t)(src_stride)), // %3 - "r"((intptr_t)(dst_stride)), // %4 - "m"(kLinearMadd31) // %5 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", - "xmm7"); -} -#endif - -#ifdef HAS_SCALEROWUP2LINEAR_12_AVX2 -void ScaleRowUp2_Linear_12_AVX2(const uint16_t* src_ptr, - uint16_t* dst_ptr, - int dst_width) { - asm volatile( - "vbroadcastf128 %3,%%ymm5 \n" - "vpcmpeqw %%ymm4,%%ymm4,%%ymm4 \n" - "vpsrlw $15,%%ymm4,%%ymm4 \n" - "vpsllw $1,%%ymm4,%%ymm4 \n" // all 2 - - LABELALIGN - "1: \n" - "vmovdqu (%0),%%ymm0 \n" // 0123456789ABCDEF (16b) - "vmovdqu 2(%0),%%ymm1 \n" // 123456789ABCDEF0 (16b) - - "vpermq $0b11011000,%%ymm0,%%ymm0 \n" // 012389AB4567CDEF - "vpermq $0b11011000,%%ymm1,%%ymm1 \n" // 12349ABC5678DEF0 - - "vpunpckhwd %%ymm1,%%ymm0,%%ymm2 \n" // 899AABBCCDDEEFF0 (near) - "vpunpcklwd %%ymm1,%%ymm0,%%ymm0 \n" // 0112233445566778 (near) - "vpshufb %%ymm5,%%ymm2,%%ymm3 \n" // 98A9BACBDCEDFE0F (far) - "vpshufb %%ymm5,%%ymm0,%%ymm1 \n" // 1021324354657687 (far) - - "vpaddw %%ymm4,%%ymm1,%%ymm1 \n" // far+2 - "vpaddw %%ymm4,%%ymm3,%%ymm3 \n" // far+2 - "vpaddw %%ymm0,%%ymm1,%%ymm1 \n" // near+far+2 - "vpaddw %%ymm2,%%ymm3,%%ymm3 \n" // near+far+2 - "vpaddw %%ymm0,%%ymm0,%%ymm0 \n" // 2*near - "vpaddw %%ymm2,%%ymm2,%%ymm2 \n" // 2*near - "vpaddw %%ymm0,%%ymm1,%%ymm0 \n" // 3*near+far+2 - "vpaddw %%ymm2,%%ymm3,%%ymm2 \n" // 3*near+far+2 - - "vpsrlw $2,%%ymm0,%%ymm0 \n" // 3/4*near+1/4*far - "vpsrlw $2,%%ymm2,%%ymm2 \n" // 3/4*near+1/4*far - "vmovdqu %%ymm0,(%1) \n" - "vmovdqu %%ymm2,32(%1) \n" - - "lea 0x20(%0),%0 \n" - "lea 0x40(%1),%1 \n" // 16 sample to 32 sample - "sub $0x20,%2 \n" - "jg 1b \n" - "vzeroupper \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : 
"m"(kLinearShuffleFar) // %3 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); -} -#endif - -#ifdef HAS_SCALEROWUP2BILINEAR_12_AVX2 -void ScaleRowUp2_Bilinear_12_AVX2(const uint16_t* src_ptr, - ptrdiff_t src_stride, - uint16_t* dst_ptr, - ptrdiff_t dst_stride, - int dst_width) { - asm volatile( - "vbroadcastf128 %5,%%ymm5 \n" - "vpcmpeqw %%ymm4,%%ymm4,%%ymm4 \n" - "vpsrlw $15,%%ymm4,%%ymm4 \n" - "vpsllw $3,%%ymm4,%%ymm4 \n" // all 8 - - LABELALIGN - "1: \n" - - "vmovdqu (%0),%%xmm0 \n" // 01234567 (16b) - "vmovdqu 2(%0),%%xmm1 \n" // 12345678 (16b) - "vpermq $0b11011000,%%ymm0,%%ymm0 \n" // 0123000045670000 - "vpermq $0b11011000,%%ymm1,%%ymm1 \n" // 1234000056780000 - "vpunpcklwd %%ymm1,%%ymm0,%%ymm0 \n" // 0112233445566778 (near) - "vpshufb %%ymm5,%%ymm0,%%ymm1 \n" // 1021324354657687 (far) - "vpaddw %%ymm0,%%ymm1,%%ymm1 \n" // near+far - "vpaddw %%ymm0,%%ymm0,%%ymm0 \n" // 2*near - "vpaddw %%ymm0,%%ymm1,%%ymm2 \n" // 3*near+far (1) - - "vmovdqu (%0,%3,2),%%xmm0 \n" // 01234567 (16b) - "vmovdqu 2(%0,%3,2),%%xmm1 \n" // 12345678 (16b) - "vpermq $0b11011000,%%ymm0,%%ymm0 \n" // 0123000045670000 - "vpermq $0b11011000,%%ymm1,%%ymm1 \n" // 1234000056780000 - "vpunpcklwd %%ymm1,%%ymm0,%%ymm0 \n" // 0112233445566778 (near) - "vpshufb %%ymm5,%%ymm0,%%ymm1 \n" // 1021324354657687 (far) - "vpaddw %%ymm0,%%ymm1,%%ymm1 \n" // near+far - "vpaddw %%ymm0,%%ymm0,%%ymm0 \n" // 2*near - "vpaddw %%ymm0,%%ymm1,%%ymm3 \n" // 3*near+far (2) - - "vpaddw %%ymm2,%%ymm2,%%ymm0 \n" // 6*near+2*far (1) - "vpaddw %%ymm4,%%ymm3,%%ymm1 \n" // 3*near+far+8 (2) - "vpaddw %%ymm0,%%ymm2,%%ymm0 \n" // 9*near+3*far (1) - "vpaddw %%ymm0,%%ymm1,%%ymm0 \n" // 9 3 3 1 + 8 (1) - "vpsrlw $4,%%ymm0,%%ymm0 \n" // ^ div by 16 - "vmovdqu %%ymm0,(%1) \n" // store above - - "vpaddw %%ymm3,%%ymm3,%%ymm0 \n" // 6*near+2*far (2) - "vpaddw %%ymm4,%%ymm2,%%ymm1 \n" // 3*near+far+8 (1) - "vpaddw %%ymm0,%%ymm3,%%ymm0 \n" // 9*near+3*far (2) - "vpaddw %%ymm0,%%ymm1,%%ymm0 \n" // 9 3 3 1 + 8 (2) - "vpsrlw $4,%%ymm0,%%ymm0 \n" // ^ div by 16 - "vmovdqu %%ymm0,(%1,%4,2) \n" // store below - - "lea 0x10(%0),%0 \n" - "lea 0x20(%1),%1 \n" // 8 sample to 16 sample - "sub $0x10,%2 \n" - "jg 1b \n" - "vzeroupper \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : "r"((intptr_t)(src_stride)), // %3 - "r"((intptr_t)(dst_stride)), // %4 - "m"(kLinearShuffleFar) // %5 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); -} -#endif - -#ifdef HAS_SCALEROWUP2LINEAR_16_AVX2 -void ScaleRowUp2_Linear_16_AVX2(const uint16_t* src_ptr, - uint16_t* dst_ptr, - int dst_width) { - asm volatile( - "vpcmpeqd %%ymm4,%%ymm4,%%ymm4 \n" - "vpsrld $31,%%ymm4,%%ymm4 \n" - "vpslld $1,%%ymm4,%%ymm4 \n" // all 2 - - LABELALIGN - "1: \n" - "vmovdqu (%0),%%xmm0 \n" // 01234567 (16b, 1u1v) - "vmovdqu 2(%0),%%xmm1 \n" // 12345678 (16b, 1u1v) - - "vpmovzxwd %%xmm0,%%ymm0 \n" // 01234567 (32b, 1u1v) - "vpmovzxwd %%xmm1,%%ymm1 \n" // 12345678 (32b, 1u1v) - - "vpshufd $0b10110001,%%ymm0,%%ymm2 \n" // 10325476 (lo, far) - "vpshufd $0b10110001,%%ymm1,%%ymm3 \n" // 21436587 (hi, far) - - "vpaddd %%ymm4,%%ymm2,%%ymm2 \n" // far+2 (lo) - "vpaddd %%ymm4,%%ymm3,%%ymm3 \n" // far+2 (hi) - "vpaddd %%ymm0,%%ymm2,%%ymm2 \n" // near+far+2 (lo) - "vpaddd %%ymm1,%%ymm3,%%ymm3 \n" // near+far+2 (hi) - "vpaddd %%ymm0,%%ymm0,%%ymm0 \n" // 2*near (lo) - "vpaddd %%ymm1,%%ymm1,%%ymm1 \n" // 2*near (hi) - "vpaddd %%ymm0,%%ymm2,%%ymm0 \n" // 3*near+far+2 (lo) - "vpaddd %%ymm1,%%ymm3,%%ymm1 \n" // 3*near+far+2 (hi) - - "vpsrld $2,%%ymm0,%%ymm0 \n" 
// 3/4*near+1/4*far (lo) - "vpsrld $2,%%ymm1,%%ymm1 \n" // 3/4*near+1/4*far (hi) - "vpackusdw %%ymm1,%%ymm0,%%ymm0 \n" - "vpshufd $0b11011000,%%ymm0,%%ymm0 \n" - "vmovdqu %%ymm0,(%1) \n" - - "lea 0x10(%0),%0 \n" - "lea 0x20(%1),%1 \n" // 8 pixel to 16 pixel - "sub $0x10,%2 \n" - "jg 1b \n" - "vzeroupper \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"); -} -#endif - -#ifdef HAS_SCALEROWUP2BILINEAR_16_AVX2 -void ScaleRowUp2_Bilinear_16_AVX2(const uint16_t* src_ptr, - ptrdiff_t src_stride, - uint16_t* dst_ptr, - ptrdiff_t dst_stride, - int dst_width) { - asm volatile( - "vpcmpeqd %%ymm6,%%ymm6,%%ymm6 \n" - "vpsrld $31,%%ymm6,%%ymm6 \n" - "vpslld $3,%%ymm6,%%ymm6 \n" // all 8 - - LABELALIGN - "1: \n" - - "vmovdqu (%0),%%xmm0 \n" // 01234567 (16b, 1u1v) - "vmovdqu 2(%0),%%xmm1 \n" // 12345678 (16b, 1u1v) - "vpmovzxwd %%xmm0,%%ymm0 \n" // 01234567 (32b, 1u1v) - "vpmovzxwd %%xmm1,%%ymm1 \n" // 12345678 (32b, 1u1v) - "vpshufd $0b10110001,%%ymm0,%%ymm2 \n" // 10325476 (lo, far) - "vpshufd $0b10110001,%%ymm1,%%ymm3 \n" // 21436587 (hi, far) - "vpaddd %%ymm0,%%ymm2,%%ymm2 \n" // near+far (lo) - "vpaddd %%ymm1,%%ymm3,%%ymm3 \n" // near+far (hi) - "vpaddd %%ymm0,%%ymm0,%%ymm0 \n" // 2*near (lo) - "vpaddd %%ymm1,%%ymm1,%%ymm1 \n" // 2*near (hi) - "vpaddd %%ymm0,%%ymm2,%%ymm0 \n" // 3*near+far (1, lo) - "vpaddd %%ymm1,%%ymm3,%%ymm1 \n" // 3*near+far (1, hi) - - "vmovdqu (%0,%3,2),%%xmm2 \n" // 01234567 (16b, 1u1v) - "vmovdqu 2(%0,%3,2),%%xmm3 \n" // 12345678 (16b, 1u1v) - "vpmovzxwd %%xmm2,%%ymm2 \n" // 01234567 (32b, 1u1v) - "vpmovzxwd %%xmm3,%%ymm3 \n" // 12345678 (32b, 1u1v) - "vpshufd $0b10110001,%%ymm2,%%ymm4 \n" // 10325476 (lo, far) - "vpshufd $0b10110001,%%ymm3,%%ymm5 \n" // 21436587 (hi, far) - "vpaddd %%ymm2,%%ymm4,%%ymm4 \n" // near+far (lo) - "vpaddd %%ymm3,%%ymm5,%%ymm5 \n" // near+far (hi) - "vpaddd %%ymm2,%%ymm2,%%ymm2 \n" // 2*near (lo) - "vpaddd %%ymm3,%%ymm3,%%ymm3 \n" // 2*near (hi) - "vpaddd %%ymm2,%%ymm4,%%ymm2 \n" // 3*near+far (2, lo) - "vpaddd %%ymm3,%%ymm5,%%ymm3 \n" // 3*near+far (2, hi) - - "vpaddd %%ymm0,%%ymm0,%%ymm4 \n" // 6*near+2*far (1, lo) - "vpaddd %%ymm6,%%ymm2,%%ymm5 \n" // 3*near+far+8 (2, lo) - "vpaddd %%ymm4,%%ymm0,%%ymm4 \n" // 9*near+3*far (1, lo) - "vpaddd %%ymm4,%%ymm5,%%ymm4 \n" // 9 3 3 1 + 8 (1, lo) - "vpsrld $4,%%ymm4,%%ymm4 \n" // ^ div by 16 (1, lo) - - "vpaddd %%ymm2,%%ymm2,%%ymm5 \n" // 6*near+2*far (2, lo) - "vpaddd %%ymm6,%%ymm0,%%ymm0 \n" // 3*near+far+8 (1, lo) - "vpaddd %%ymm5,%%ymm2,%%ymm5 \n" // 9*near+3*far (2, lo) - "vpaddd %%ymm5,%%ymm0,%%ymm5 \n" // 9 3 3 1 + 8 (2, lo) - "vpsrld $4,%%ymm5,%%ymm5 \n" // ^ div by 16 (2, lo) - - "vpaddd %%ymm1,%%ymm1,%%ymm0 \n" // 6*near+2*far (1, hi) - "vpaddd %%ymm6,%%ymm3,%%ymm2 \n" // 3*near+far+8 (2, hi) - "vpaddd %%ymm0,%%ymm1,%%ymm0 \n" // 9*near+3*far (1, hi) - "vpaddd %%ymm0,%%ymm2,%%ymm0 \n" // 9 3 3 1 + 8 (1, hi) - "vpsrld $4,%%ymm0,%%ymm0 \n" // ^ div by 16 (1, hi) - - "vpaddd %%ymm3,%%ymm3,%%ymm2 \n" // 6*near+2*far (2, hi) - "vpaddd %%ymm6,%%ymm1,%%ymm1 \n" // 3*near+far+8 (1, hi) - "vpaddd %%ymm2,%%ymm3,%%ymm2 \n" // 9*near+3*far (2, hi) - "vpaddd %%ymm2,%%ymm1,%%ymm2 \n" // 9 3 3 1 + 8 (2, hi) - "vpsrld $4,%%ymm2,%%ymm2 \n" // ^ div by 16 (2, hi) - - "vpackusdw %%ymm0,%%ymm4,%%ymm4 \n" - "vpshufd $0b11011000,%%ymm4,%%ymm4 \n" - "vmovdqu %%ymm4,(%1) \n" // store above - "vpackusdw %%ymm2,%%ymm5,%%ymm5 \n" - "vpshufd $0b11011000,%%ymm5,%%ymm5 \n" - "vmovdqu %%ymm5,(%1,%4,2) \n" // store below - - "lea 
0x10(%0),%0 \n" - "lea 0x20(%1),%1 \n" // 8 pixel to 16 pixel - "sub $0x10,%2 \n" - "jg 1b \n" - "vzeroupper \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : "r"((intptr_t)(src_stride)), // %3 - "r"((intptr_t)(dst_stride)) // %4 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"); -} -#endif - -// Reads 16xN bytes and produces 16 shorts at a time. -void ScaleAddRow_SSE2(const uint8_t* src_ptr, - uint16_t* dst_ptr, - int src_width) { - asm volatile("pxor %%xmm5,%%xmm5 \n" - - // 16 pixel loop. - LABELALIGN - "1: \n" - "movdqu (%0),%%xmm3 \n" - "lea 0x10(%0),%0 \n" // src_ptr += 16 - "movdqu (%1),%%xmm0 \n" - "movdqu 0x10(%1),%%xmm1 \n" - "movdqa %%xmm3,%%xmm2 \n" - "punpcklbw %%xmm5,%%xmm2 \n" - "punpckhbw %%xmm5,%%xmm3 \n" - "paddusw %%xmm2,%%xmm0 \n" - "paddusw %%xmm3,%%xmm1 \n" - "movdqu %%xmm0,(%1) \n" - "movdqu %%xmm1,0x10(%1) \n" - "lea 0x20(%1),%1 \n" - "sub $0x10,%2 \n" - "jg 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(src_width) // %2 - : - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"); -} - -#ifdef HAS_SCALEADDROW_AVX2 -// Reads 32 bytes and accumulates to 32 shorts at a time. -void ScaleAddRow_AVX2(const uint8_t* src_ptr, - uint16_t* dst_ptr, - int src_width) { - asm volatile("vpxor %%ymm5,%%ymm5,%%ymm5 \n" - - LABELALIGN - "1: \n" - "vmovdqu (%0),%%ymm3 \n" - "lea 0x20(%0),%0 \n" // src_ptr += 32 - "vpermq $0xd8,%%ymm3,%%ymm3 \n" - "vpunpcklbw %%ymm5,%%ymm3,%%ymm2 \n" - "vpunpckhbw %%ymm5,%%ymm3,%%ymm3 \n" - "vpaddusw (%1),%%ymm2,%%ymm0 \n" - "vpaddusw 0x20(%1),%%ymm3,%%ymm1 \n" - "vmovdqu %%ymm0,(%1) \n" - "vmovdqu %%ymm1,0x20(%1) \n" - "lea 0x40(%1),%1 \n" - "sub $0x20,%2 \n" - "jg 1b \n" - "vzeroupper \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(src_width) // %2 - : - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"); -} -#endif // HAS_SCALEADDROW_AVX2 - -// Constant for making pixels signed to avoid pmaddubsw -// saturation. -static const uvec8 kFsub80 = {0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, - 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80}; - -// Constant for making pixels unsigned and adding .5 for rounding. -static const uvec16 kFadd40 = {0x4040, 0x4040, 0x4040, 0x4040, - 0x4040, 0x4040, 0x4040, 0x4040}; - -// Bilinear column filtering. SSSE3 version. -void ScaleFilterCols_SSSE3(uint8_t* dst_ptr, - const uint8_t* src_ptr, - int dst_width, - int x, - int dx) { - intptr_t x0, x1, temp_pixel; - asm volatile( - "movd %6,%%xmm2 \n" - "movd %7,%%xmm3 \n" - "movl $0x04040000,%k2 \n" - "movd %k2,%%xmm5 \n" - "pcmpeqb %%xmm6,%%xmm6 \n" - "psrlw $0x9,%%xmm6 \n" // 0x007f007f - "pcmpeqb %%xmm7,%%xmm7 \n" - "psrlw $15,%%xmm7 \n" // 0x00010001 - - "pextrw $0x1,%%xmm2,%k3 \n" - "subl $0x2,%5 \n" - "jl 29f \n" - "movdqa %%xmm2,%%xmm0 \n" - "paddd %%xmm3,%%xmm0 \n" - "punpckldq %%xmm0,%%xmm2 \n" - "punpckldq %%xmm3,%%xmm3 \n" - "paddd %%xmm3,%%xmm3 \n" - "pextrw $0x3,%%xmm2,%k4 \n" - - LABELALIGN - "2: \n" - "movdqa %%xmm2,%%xmm1 \n" - "paddd %%xmm3,%%xmm2 \n" - "movzwl 0x00(%1,%3,1),%k2 \n" - "movd %k2,%%xmm0 \n" - "psrlw $0x9,%%xmm1 \n" - "movzwl 0x00(%1,%4,1),%k2 \n" - "movd %k2,%%xmm4 \n" - "pshufb %%xmm5,%%xmm1 \n" - "punpcklwd %%xmm4,%%xmm0 \n" - "psubb %8,%%xmm0 \n" // make pixels signed. - "pxor %%xmm6,%%xmm1 \n" // 128 - f = (f ^ 127 ) + - // 1 - "paddusb %%xmm7,%%xmm1 \n" - "pmaddubsw %%xmm0,%%xmm1 \n" - "pextrw $0x1,%%xmm2,%k3 \n" - "pextrw $0x3,%%xmm2,%k4 \n" - "paddw %9,%%xmm1 \n" // make pixels unsigned. 
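// Editorial note (illustration only, not part of the libyuv sources being removed):
// this loop is the bilinear column filter.  x is a 16.16 fixed-point source
// position; "psrlw $0x9" keeps a 7-bit fraction f, and kFsub80/kFadd40 exist only
// because pmaddubsw would saturate on unsigned pixels: subtracting 0x80 makes the
// pixels signed, and since the two weights sum to 128 the bias costs exactly
// 128*128 = 16384, which kFadd40 (0x4040 = 16384 + 64) restores while also adding
// the 0.5 rounding term before the >>7.  A scalar sketch of the intended result
// (hypothetical helper name):
#include <stdint.h>
static uint8_t FilterColPixel(const uint8_t* src, int x /* 16.16 */) {
  int xi = x >> 16;         // integer source column
  int f = (x >> 9) & 0x7f;  // 7-bit fraction, as produced by psrlw $0x9
  // Weights (128 - f) and f sum to 128; +64 rounds before the >>7.
  return (uint8_t)((src[xi] * (128 - f) + src[xi + 1] * f + 64) >> 7);
}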
- "psrlw $0x7,%%xmm1 \n" - "packuswb %%xmm1,%%xmm1 \n" - "movd %%xmm1,%k2 \n" - "mov %w2,(%0) \n" - "lea 0x2(%0),%0 \n" - "subl $0x2,%5 \n" - "jge 2b \n" - - LABELALIGN - "29: \n" - "addl $0x1,%5 \n" - "jl 99f \n" - "movzwl 0x00(%1,%3,1),%k2 \n" - "movd %k2,%%xmm0 \n" - "psrlw $0x9,%%xmm2 \n" - "pshufb %%xmm5,%%xmm2 \n" - "psubb %8,%%xmm0 \n" // make pixels signed. - "pxor %%xmm6,%%xmm2 \n" - "paddusb %%xmm7,%%xmm2 \n" - "pmaddubsw %%xmm0,%%xmm2 \n" - "paddw %9,%%xmm2 \n" // make pixels unsigned. - "psrlw $0x7,%%xmm2 \n" - "packuswb %%xmm2,%%xmm2 \n" - "movd %%xmm2,%k2 \n" - "mov %b2,(%0) \n" - "99: \n" - : "+r"(dst_ptr), // %0 - "+r"(src_ptr), // %1 - "=&a"(temp_pixel), // %2 - "=&r"(x0), // %3 - "=&r"(x1), // %4 -#if defined(__x86_64__) - "+rm"(dst_width) // %5 -#else - "+m"(dst_width) // %5 -#endif - : "rm"(x), // %6 - "rm"(dx), // %7 -#if defined(__x86_64__) - "x"(kFsub80), // %8 - "x"(kFadd40) // %9 -#else - "m"(kFsub80), // %8 - "m"(kFadd40) // %9 -#endif - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", - "xmm7"); -} - -// Reads 4 pixels, duplicates them and writes 8 pixels. -// Alignment requirement: src_argb 16 byte aligned, dst_argb 16 byte aligned. -void ScaleColsUp2_SSE2(uint8_t* dst_ptr, - const uint8_t* src_ptr, - int dst_width, - int x, - int dx) { - (void)x; - (void)dx; - asm volatile(LABELALIGN - "1: \n" - "movdqu (%1),%%xmm0 \n" - "lea 0x10(%1),%1 \n" - "movdqa %%xmm0,%%xmm1 \n" - "punpcklbw %%xmm0,%%xmm0 \n" - "punpckhbw %%xmm1,%%xmm1 \n" - "movdqu %%xmm0,(%0) \n" - "movdqu %%xmm1,0x10(%0) \n" - "lea 0x20(%0),%0 \n" - "sub $0x20,%2 \n" - "jg 1b \n" - - : "+r"(dst_ptr), // %0 - "+r"(src_ptr), // %1 - "+r"(dst_width) // %2 - ::"memory", - "cc", "xmm0", "xmm1"); -} - -void ScaleARGBRowDown2_SSE2(const uint8_t* src_argb, - ptrdiff_t src_stride, - uint8_t* dst_argb, - int dst_width) { - (void)src_stride; - asm volatile(LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "lea 0x20(%0),%0 \n" - "shufps $0xdd,%%xmm1,%%xmm0 \n" - "movdqu %%xmm0,(%1) \n" - "lea 0x10(%1),%1 \n" - "sub $0x4,%2 \n" - "jg 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_argb), // %1 - "+r"(dst_width) // %2 - ::"memory", - "cc", "xmm0", "xmm1"); -} - -void ScaleARGBRowDown2Linear_SSE2(const uint8_t* src_argb, - ptrdiff_t src_stride, - uint8_t* dst_argb, - int dst_width) { - (void)src_stride; - asm volatile(LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "lea 0x20(%0),%0 \n" - "movdqa %%xmm0,%%xmm2 \n" - "shufps $0x88,%%xmm1,%%xmm0 \n" - "shufps $0xdd,%%xmm1,%%xmm2 \n" - "pavgb %%xmm2,%%xmm0 \n" - "movdqu %%xmm0,(%1) \n" - "lea 0x10(%1),%1 \n" - "sub $0x4,%2 \n" - "jg 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_argb), // %1 - "+r"(dst_width) // %2 - ::"memory", - "cc", "xmm0", "xmm1"); -} - -void ScaleARGBRowDown2Box_SSE2(const uint8_t* src_argb, - ptrdiff_t src_stride, - uint8_t* dst_argb, - int dst_width) { - asm volatile(LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "movdqu 0x00(%0,%3,1),%%xmm2 \n" - "movdqu 0x10(%0,%3,1),%%xmm3 \n" - "lea 0x20(%0),%0 \n" - "pavgb %%xmm2,%%xmm0 \n" - "pavgb %%xmm3,%%xmm1 \n" - "movdqa %%xmm0,%%xmm2 \n" - "shufps $0x88,%%xmm1,%%xmm0 \n" - "shufps $0xdd,%%xmm1,%%xmm2 \n" - "pavgb %%xmm2,%%xmm0 \n" - "movdqu %%xmm0,(%1) \n" - "lea 0x10(%1),%1 \n" - "sub $0x4,%2 \n" - "jg 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_argb), // %1 - "+r"(dst_width) // %2 - : "r"((intptr_t)(src_stride)) // %3 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3"); -} - -// Reads 4 
pixels at a time. -// Alignment requirement: dst_argb 16 byte aligned. -void ScaleARGBRowDownEven_SSE2(const uint8_t* src_argb, - ptrdiff_t src_stride, - int src_stepx, - uint8_t* dst_argb, - int dst_width) { - intptr_t src_stepx_x4 = (intptr_t)(src_stepx); - intptr_t src_stepx_x12; - (void)src_stride; - asm volatile( - "lea 0x00(,%1,4),%1 \n" - "lea 0x00(%1,%1,2),%4 \n" - - LABELALIGN - "1: \n" - "movd (%0),%%xmm0 \n" - "movd 0x00(%0,%1,1),%%xmm1 \n" - "punpckldq %%xmm1,%%xmm0 \n" - "movd 0x00(%0,%1,2),%%xmm2 \n" - "movd 0x00(%0,%4,1),%%xmm3 \n" - "lea 0x00(%0,%1,4),%0 \n" - "punpckldq %%xmm3,%%xmm2 \n" - "punpcklqdq %%xmm2,%%xmm0 \n" - "movdqu %%xmm0,(%2) \n" - "lea 0x10(%2),%2 \n" - "sub $0x4,%3 \n" - "jg 1b \n" - : "+r"(src_argb), // %0 - "+r"(src_stepx_x4), // %1 - "+r"(dst_argb), // %2 - "+r"(dst_width), // %3 - "=&r"(src_stepx_x12) // %4 - ::"memory", - "cc", "xmm0", "xmm1", "xmm2", "xmm3"); -} - -// Blends four 2x2 to 4x1. -// Alignment requirement: dst_argb 16 byte aligned. -void ScaleARGBRowDownEvenBox_SSE2(const uint8_t* src_argb, - ptrdiff_t src_stride, - int src_stepx, - uint8_t* dst_argb, - int dst_width) { - intptr_t src_stepx_x4 = (intptr_t)(src_stepx); - intptr_t src_stepx_x12; - intptr_t row1 = (intptr_t)(src_stride); - asm volatile( - "lea 0x00(,%1,4),%1 \n" - "lea 0x00(%1,%1,2),%4 \n" - "lea 0x00(%0,%5,1),%5 \n" - - LABELALIGN - "1: \n" - "movq (%0),%%xmm0 \n" - "movhps 0x00(%0,%1,1),%%xmm0 \n" - "movq 0x00(%0,%1,2),%%xmm1 \n" - "movhps 0x00(%0,%4,1),%%xmm1 \n" - "lea 0x00(%0,%1,4),%0 \n" - "movq (%5),%%xmm2 \n" - "movhps 0x00(%5,%1,1),%%xmm2 \n" - "movq 0x00(%5,%1,2),%%xmm3 \n" - "movhps 0x00(%5,%4,1),%%xmm3 \n" - "lea 0x00(%5,%1,4),%5 \n" - "pavgb %%xmm2,%%xmm0 \n" - "pavgb %%xmm3,%%xmm1 \n" - "movdqa %%xmm0,%%xmm2 \n" - "shufps $0x88,%%xmm1,%%xmm0 \n" - "shufps $0xdd,%%xmm1,%%xmm2 \n" - "pavgb %%xmm2,%%xmm0 \n" - "movdqu %%xmm0,(%2) \n" - "lea 0x10(%2),%2 \n" - "sub $0x4,%3 \n" - "jg 1b \n" - : "+r"(src_argb), // %0 - "+r"(src_stepx_x4), // %1 - "+r"(dst_argb), // %2 - "+rm"(dst_width), // %3 - "=&r"(src_stepx_x12), // %4 - "+r"(row1) // %5 - ::"memory", - "cc", "xmm0", "xmm1", "xmm2", "xmm3"); -} - -void ScaleARGBCols_SSE2(uint8_t* dst_argb, - const uint8_t* src_argb, - int dst_width, - int x, - int dx) { - intptr_t x0, x1; - asm volatile( - "movd %5,%%xmm2 \n" - "movd %6,%%xmm3 \n" - "pshufd $0x0,%%xmm2,%%xmm2 \n" - "pshufd $0x11,%%xmm3,%%xmm0 \n" - "paddd %%xmm0,%%xmm2 \n" - "paddd %%xmm3,%%xmm3 \n" - "pshufd $0x5,%%xmm3,%%xmm0 \n" - "paddd %%xmm0,%%xmm2 \n" - "paddd %%xmm3,%%xmm3 \n" - "pshufd $0x0,%%xmm3,%%xmm3 \n" - "pextrw $0x1,%%xmm2,%k0 \n" - "pextrw $0x3,%%xmm2,%k1 \n" - "cmp $0x0,%4 \n" - "jl 99f \n" - "sub $0x4,%4 \n" - "jl 49f \n" - - LABELALIGN - "40: \n" - "movd 0x00(%3,%0,4),%%xmm0 \n" - "movd 0x00(%3,%1,4),%%xmm1 \n" - "pextrw $0x5,%%xmm2,%k0 \n" - "pextrw $0x7,%%xmm2,%k1 \n" - "paddd %%xmm3,%%xmm2 \n" - "punpckldq %%xmm1,%%xmm0 \n" - "movd 0x00(%3,%0,4),%%xmm1 \n" - "movd 0x00(%3,%1,4),%%xmm4 \n" - "pextrw $0x1,%%xmm2,%k0 \n" - "pextrw $0x3,%%xmm2,%k1 \n" - "punpckldq %%xmm4,%%xmm1 \n" - "punpcklqdq %%xmm1,%%xmm0 \n" - "movdqu %%xmm0,(%2) \n" - "lea 0x10(%2),%2 \n" - "sub $0x4,%4 \n" - "jge 40b \n" - - "49: \n" - "test $0x2,%4 \n" - "je 29f \n" - "movd 0x00(%3,%0,4),%%xmm0 \n" - "movd 0x00(%3,%1,4),%%xmm1 \n" - "pextrw $0x5,%%xmm2,%k0 \n" - "punpckldq %%xmm1,%%xmm0 \n" - "movq %%xmm0,(%2) \n" - "lea 0x8(%2),%2 \n" - "29: \n" - "test $0x1,%4 \n" - "je 99f \n" - "movd 0x00(%3,%0,4),%%xmm0 \n" - "movd %%xmm0,(%2) \n" - "99: \n" - : "=&a"(x0), // %0 - 
"=&d"(x1), // %1 - "+r"(dst_argb), // %2 - "+r"(src_argb), // %3 - "+r"(dst_width) // %4 - : "rm"(x), // %5 - "rm"(dx) // %6 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"); -} - -// Reads 4 pixels, duplicates them and writes 8 pixels. -// Alignment requirement: src_argb 16 byte aligned, dst_argb 16 byte aligned. -void ScaleARGBColsUp2_SSE2(uint8_t* dst_argb, - const uint8_t* src_argb, - int dst_width, - int x, - int dx) { - (void)x; - (void)dx; - asm volatile(LABELALIGN - "1: \n" - "movdqu (%1),%%xmm0 \n" - "lea 0x10(%1),%1 \n" - "movdqa %%xmm0,%%xmm1 \n" - "punpckldq %%xmm0,%%xmm0 \n" - "punpckhdq %%xmm1,%%xmm1 \n" - "movdqu %%xmm0,(%0) \n" - "movdqu %%xmm1,0x10(%0) \n" - "lea 0x20(%0),%0 \n" - "sub $0x8,%2 \n" - "jg 1b \n" - - : "+r"(dst_argb), // %0 - "+r"(src_argb), // %1 - "+r"(dst_width) // %2 - ::"memory", - "cc", "xmm0", "xmm1"); -} - -// Shuffle table for arranging 2 pixels into pairs for pmaddubsw -static const uvec8 kShuffleColARGB = { - 0u, 4u, 1u, 5u, 2u, 6u, 3u, 7u, // bbggrraa 1st pixel - 8u, 12u, 9u, 13u, 10u, 14u, 11u, 15u // bbggrraa 2nd pixel -}; - -// Shuffle table for duplicating 2 fractions into 8 bytes each -static const uvec8 kShuffleFractions = { - 0u, 0u, 0u, 0u, 0u, 0u, 0u, 0u, 4u, 4u, 4u, 4u, 4u, 4u, 4u, 4u, -}; - -// Bilinear row filtering combines 4x2 -> 4x1. SSSE3 version -void ScaleARGBFilterCols_SSSE3(uint8_t* dst_argb, - const uint8_t* src_argb, - int dst_width, - int x, - int dx) { - intptr_t x0, x1; - asm volatile( - "movdqa %0,%%xmm4 \n" - "movdqa %1,%%xmm5 \n" - : - : "m"(kShuffleColARGB), // %0 - "m"(kShuffleFractions) // %1 - ); - - asm volatile( - "movd %5,%%xmm2 \n" - "movd %6,%%xmm3 \n" - "pcmpeqb %%xmm6,%%xmm6 \n" - "psrlw $0x9,%%xmm6 \n" - "pextrw $0x1,%%xmm2,%k3 \n" - "sub $0x2,%2 \n" - "jl 29f \n" - "movdqa %%xmm2,%%xmm0 \n" - "paddd %%xmm3,%%xmm0 \n" - "punpckldq %%xmm0,%%xmm2 \n" - "punpckldq %%xmm3,%%xmm3 \n" - "paddd %%xmm3,%%xmm3 \n" - "pextrw $0x3,%%xmm2,%k4 \n" - - LABELALIGN - "2: \n" - "movdqa %%xmm2,%%xmm1 \n" - "paddd %%xmm3,%%xmm2 \n" - "movq 0x00(%1,%3,4),%%xmm0 \n" - "psrlw $0x9,%%xmm1 \n" - "movhps 0x00(%1,%4,4),%%xmm0 \n" - "pshufb %%xmm5,%%xmm1 \n" - "pshufb %%xmm4,%%xmm0 \n" - "pxor %%xmm6,%%xmm1 \n" - "pmaddubsw %%xmm1,%%xmm0 \n" - "psrlw $0x7,%%xmm0 \n" - "pextrw $0x1,%%xmm2,%k3 \n" - "pextrw $0x3,%%xmm2,%k4 \n" - "packuswb %%xmm0,%%xmm0 \n" - "movq %%xmm0,(%0) \n" - "lea 0x8(%0),%0 \n" - "sub $0x2,%2 \n" - "jge 2b \n" - - LABELALIGN - "29: \n" - "add $0x1,%2 \n" - "jl 99f \n" - "psrlw $0x9,%%xmm2 \n" - "movq 0x00(%1,%3,4),%%xmm0 \n" - "pshufb %%xmm5,%%xmm2 \n" - "pshufb %%xmm4,%%xmm0 \n" - "pxor %%xmm6,%%xmm2 \n" - "pmaddubsw %%xmm2,%%xmm0 \n" - "psrlw $0x7,%%xmm0 \n" - "packuswb %%xmm0,%%xmm0 \n" - "movd %%xmm0,(%0) \n" - - LABELALIGN - "99: \n" // clang-format error. - - : "+r"(dst_argb), // %0 - "+r"(src_argb), // %1 - "+rm"(dst_width), // %2 - "=&r"(x0), // %3 - "=&r"(x1) // %4 - : "rm"(x), // %5 - "rm"(dx) // %6 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"); -} - -// Divide num by div and return as 16.16 fixed point result. -int FixedDiv_X86(int num, int div) { - asm volatile( - "cdq \n" - "shld $0x10,%%eax,%%edx \n" - "shl $0x10,%%eax \n" - "idiv %1 \n" - "mov %0, %%eax \n" - : "+a"(num) // %0 - : "c"(div) // %1 - : "memory", "cc", "edx"); - return num; -} - -// Divide num - 1 by div - 1 and return as 16.16 fixed point result. 
-int FixedDiv1_X86(int num, int div) { - asm volatile( - "cdq \n" - "shld $0x10,%%eax,%%edx \n" - "shl $0x10,%%eax \n" - "sub $0x10001,%%eax \n" - "sbb $0x0,%%edx \n" - "sub $0x1,%1 \n" - "idiv %1 \n" - "mov %0, %%eax \n" - : "+a"(num) // %0 - : "c"(div) // %1 - : "memory", "cc", "edx"); - return num; -} - -#ifdef HAS_SCALEUVROWDOWN2BOX_SSSE3 -// Shuffle table for splitting UV into upper and lower part of register. -static const uvec8 kShuffleSplitUV = {0u, 2u, 4u, 6u, 8u, 10u, 12u, 14u, - 1u, 3u, 5u, 7u, 9u, 11u, 13u, 15u}; -static const uvec8 kShuffleMergeUV = {0u, 8u, 2u, 10u, 4u, 12u, - 6u, 14u, 0x80, 0x80, 0x80, 0x80, - 0x80, 0x80, 0x80, 0x80}; - -void ScaleUVRowDown2Box_SSSE3(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width) { - asm volatile( - "pcmpeqb %%xmm4,%%xmm4 \n" // 01010101 - "psrlw $0xf,%%xmm4 \n" - "packuswb %%xmm4,%%xmm4 \n" - "pxor %%xmm5, %%xmm5 \n" // zero - "movdqa %4,%%xmm1 \n" // split shuffler - "movdqa %5,%%xmm3 \n" // merge shuffler - - LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" // 8 UV row 0 - "movdqu 0x00(%0,%3,1),%%xmm2 \n" // 8 UV row 1 - "lea 0x10(%0),%0 \n" - "pshufb %%xmm1,%%xmm0 \n" // uuuuvvvv - "pshufb %%xmm1,%%xmm2 \n" - "pmaddubsw %%xmm4,%%xmm0 \n" // horizontal add - "pmaddubsw %%xmm4,%%xmm2 \n" - "paddw %%xmm2,%%xmm0 \n" // vertical add - "psrlw $0x1,%%xmm0 \n" // round - "pavgw %%xmm5,%%xmm0 \n" - "pshufb %%xmm3,%%xmm0 \n" // merge uv - "movq %%xmm0,(%1) \n" - "lea 0x8(%1),%1 \n" // 4 UV - "sub $0x4,%2 \n" - "jg 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : "r"((intptr_t)(src_stride)), // %3 - "m"(kShuffleSplitUV), // %4 - "m"(kShuffleMergeUV) // %5 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); -} -#endif // HAS_SCALEUVROWDOWN2BOX_SSSE3 - -#ifdef HAS_SCALEUVROWDOWN2BOX_AVX2 -void ScaleUVRowDown2Box_AVX2(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width) { - asm volatile( - "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n" // 01010101 - "vpsrlw $0xf,%%ymm4,%%ymm4 \n" - "vpackuswb %%ymm4,%%ymm4,%%ymm4 \n" - "vpxor %%ymm5,%%ymm5,%%ymm5 \n" // zero - "vbroadcastf128 %4,%%ymm1 \n" // split shuffler - "vbroadcastf128 %5,%%ymm3 \n" // merge shuffler - - LABELALIGN - "1: \n" - "vmovdqu (%0),%%ymm0 \n" // 16 UV row 0 - "vmovdqu 0x00(%0,%3,1),%%ymm2 \n" // 16 UV row 1 - "lea 0x20(%0),%0 \n" - "vpshufb %%ymm1,%%ymm0,%%ymm0 \n" // uuuuvvvv - "vpshufb %%ymm1,%%ymm2,%%ymm2 \n" - "vpmaddubsw %%ymm4,%%ymm0,%%ymm0 \n" // horizontal add - "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n" - "vpaddw %%ymm2,%%ymm0,%%ymm0 \n" // vertical add - "vpsrlw $0x1,%%ymm0,%%ymm0 \n" // round - "vpavgw %%ymm5,%%ymm0,%%ymm0 \n" - "vpshufb %%ymm3,%%ymm0,%%ymm0 \n" // merge uv - "vpermq $0xd8,%%ymm0,%%ymm0 \n" // combine qwords - "vmovdqu %%xmm0,(%1) \n" - "lea 0x10(%1),%1 \n" // 8 UV - "sub $0x8,%2 \n" - "jg 1b \n" - "vzeroupper \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : "r"((intptr_t)(src_stride)), // %3 - "m"(kShuffleSplitUV), // %4 - "m"(kShuffleMergeUV) // %5 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); -} -#endif // HAS_SCALEUVROWDOWN2BOX_AVX2 - -static const uvec8 kUVLinearMadd31 = {3, 1, 3, 1, 1, 3, 1, 3, - 3, 1, 3, 1, 1, 3, 1, 3}; - -#ifdef HAS_SCALEUVROWUP2LINEAR_SSSE3 -void ScaleUVRowUp2_Linear_SSSE3(const uint8_t* src_ptr, - uint8_t* dst_ptr, - int dst_width) { - asm volatile( - "pcmpeqw %%xmm4,%%xmm4 \n" - "psrlw $15,%%xmm4 \n" - "psllw $1,%%xmm4 \n" // all 2 - "movdqa %3,%%xmm3 \n" - - LABELALIGN - "1: \n" 
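// Editorial note (illustration only, not part of the libyuv sources being removed):
// the ScaleUVRowUp2 routines below apply the same (3*near + far + 2) >> 2 and
// 9/3/3/1 kernels as the planar routines above, but every sample is an interleaved
// U,V byte pair, hence the loads offset by 2 bytes and the kUVLinearMadd31 layout
// that keeps each 3/1 weighted pair inside a single channel.  Per-channel scalar
// equivalent, with illustrative variable names:
//   u_out = (uint8_t)((3 * u_near + u_far + 2) >> 2);
//   v_out = (uint8_t)((3 * v_near + v_far + 2) >> 2);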
- "movq (%0),%%xmm0 \n" // 00112233 (1u1v) - "movq 2(%0),%%xmm1 \n" // 11223344 (1u1v) - "punpcklbw %%xmm1,%%xmm0 \n" // 0101121223233434 (2u2v) - "movdqa %%xmm0,%%xmm2 \n" - "punpckhdq %%xmm0,%%xmm2 \n" // 2323232334343434 (2u2v) - "punpckldq %%xmm0,%%xmm0 \n" // 0101010112121212 (2u2v) - "pmaddubsw %%xmm3,%%xmm2 \n" // 3*near+far (1u1v16, hi) - "pmaddubsw %%xmm3,%%xmm0 \n" // 3*near+far (1u1v16, lo) - "paddw %%xmm4,%%xmm0 \n" // 3*near+far+2 (lo) - "paddw %%xmm4,%%xmm2 \n" // 3*near+far+2 (hi) - "psrlw $2,%%xmm0 \n" // 3/4*near+1/4*far (lo) - "psrlw $2,%%xmm2 \n" // 3/4*near+1/4*far (hi) - "packuswb %%xmm2,%%xmm0 \n" - "movdqu %%xmm0,(%1) \n" - - "lea 0x8(%0),%0 \n" - "lea 0x10(%1),%1 \n" // 4 uv to 8 uv - "sub $0x8,%2 \n" - "jg 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : "m"(kUVLinearMadd31) // %3 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"); -} -#endif - -#ifdef HAS_SCALEUVROWUP2BILINEAR_SSSE3 -void ScaleUVRowUp2_Bilinear_SSSE3(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - ptrdiff_t dst_stride, - int dst_width) { - asm volatile( - "pcmpeqw %%xmm6,%%xmm6 \n" - "psrlw $15,%%xmm6 \n" - "psllw $3,%%xmm6 \n" // all 8 - "movdqa %5,%%xmm7 \n" - - LABELALIGN - "1: \n" - "movq (%0),%%xmm0 \n" // 00112233 (1u1v) - "movq 2(%0),%%xmm1 \n" // 11223344 (1u1v) - "punpcklbw %%xmm1,%%xmm0 \n" // 0101121223233434 (2u2v) - "movdqa %%xmm0,%%xmm2 \n" - "punpckhdq %%xmm0,%%xmm2 \n" // 2323232334343434 (2u2v) - "punpckldq %%xmm0,%%xmm0 \n" // 0101010112121212 (2u2v) - "pmaddubsw %%xmm7,%%xmm2 \n" // 3*near+far (1u1v16, hi) - "pmaddubsw %%xmm7,%%xmm0 \n" // 3*near+far (1u1v16, lo) - - "movq (%0,%3),%%xmm1 \n" - "movq 2(%0,%3),%%xmm4 \n" - "punpcklbw %%xmm4,%%xmm1 \n" - "movdqa %%xmm1,%%xmm3 \n" - "punpckhdq %%xmm1,%%xmm3 \n" - "punpckldq %%xmm1,%%xmm1 \n" - "pmaddubsw %%xmm7,%%xmm3 \n" // 3*near+far (2, hi) - "pmaddubsw %%xmm7,%%xmm1 \n" // 3*near+far (2, lo) - - // xmm0 xmm2 - // xmm1 xmm3 - - "movdqa %%xmm0,%%xmm4 \n" - "movdqa %%xmm1,%%xmm5 \n" - "paddw %%xmm0,%%xmm4 \n" // 6*near+2*far (1, lo) - "paddw %%xmm6,%%xmm5 \n" // 3*near+far+8 (2, lo) - "paddw %%xmm0,%%xmm4 \n" // 9*near+3*far (1, lo) - "paddw %%xmm5,%%xmm4 \n" // 9 3 3 1 + 8 (1, lo) - "psrlw $4,%%xmm4 \n" // ^ div by 16 (1, lo) - - "movdqa %%xmm1,%%xmm5 \n" - "paddw %%xmm1,%%xmm5 \n" // 6*near+2*far (2, lo) - "paddw %%xmm6,%%xmm0 \n" // 3*near+far+8 (1, lo) - "paddw %%xmm1,%%xmm5 \n" // 9*near+3*far (2, lo) - "paddw %%xmm0,%%xmm5 \n" // 9 3 3 1 + 8 (2, lo) - "psrlw $4,%%xmm5 \n" // ^ div by 16 (2, lo) - - "movdqa %%xmm2,%%xmm0 \n" - "movdqa %%xmm3,%%xmm1 \n" - "paddw %%xmm2,%%xmm0 \n" // 6*near+2*far (1, hi) - "paddw %%xmm6,%%xmm1 \n" // 3*near+far+8 (2, hi) - "paddw %%xmm2,%%xmm0 \n" // 9*near+3*far (1, hi) - "paddw %%xmm1,%%xmm0 \n" // 9 3 3 1 + 8 (1, hi) - "psrlw $4,%%xmm0 \n" // ^ div by 16 (1, hi) - - "movdqa %%xmm3,%%xmm1 \n" - "paddw %%xmm3,%%xmm1 \n" // 6*near+2*far (2, hi) - "paddw %%xmm6,%%xmm2 \n" // 3*near+far+8 (1, hi) - "paddw %%xmm3,%%xmm1 \n" // 9*near+3*far (2, hi) - "paddw %%xmm2,%%xmm1 \n" // 9 3 3 1 + 8 (2, hi) - "psrlw $4,%%xmm1 \n" // ^ div by 16 (2, hi) - - "packuswb %%xmm0,%%xmm4 \n" - "movdqu %%xmm4,(%1) \n" // store above - "packuswb %%xmm1,%%xmm5 \n" - "movdqu %%xmm5,(%1,%4) \n" // store below - - "lea 0x8(%0),%0 \n" - "lea 0x10(%1),%1 \n" // 4 uv to 8 uv - "sub $0x8,%2 \n" - "jg 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : "r"((intptr_t)(src_stride)), // %3 - "r"((intptr_t)(dst_stride)), 
// %4 - "m"(kUVLinearMadd31) // %5 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", - "xmm7"); -} -#endif - -#ifdef HAS_SCALEUVROWUP2LINEAR_AVX2 - -void ScaleUVRowUp2_Linear_AVX2(const uint8_t* src_ptr, - uint8_t* dst_ptr, - int dst_width) { - asm volatile( - "vpcmpeqw %%ymm4,%%ymm4,%%ymm4 \n" - "vpsrlw $15,%%ymm4,%%ymm4 \n" - "vpsllw $1,%%ymm4,%%ymm4 \n" // all 2 - "vbroadcastf128 %3,%%ymm3 \n" - - LABELALIGN - "1: \n" - "vmovdqu (%0),%%xmm0 \n" - "vmovdqu 2(%0),%%xmm1 \n" - "vpermq $0b11011000,%%ymm0,%%ymm0 \n" - "vpermq $0b11011000,%%ymm1,%%ymm1 \n" - "vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n" - "vpunpckhdq %%ymm0,%%ymm0,%%ymm2 \n" - "vpunpckldq %%ymm0,%%ymm0,%%ymm0 \n" - "vpmaddubsw %%ymm3,%%ymm2,%%ymm1 \n" // 3*near+far (hi) - "vpmaddubsw %%ymm3,%%ymm0,%%ymm0 \n" // 3*near+far (lo) - "vpaddw %%ymm4,%%ymm0,%%ymm0 \n" // 3*near+far+2 (lo) - "vpaddw %%ymm4,%%ymm1,%%ymm1 \n" // 3*near+far+2 (hi) - "vpsrlw $2,%%ymm0,%%ymm0 \n" // 3/4*near+1/4*far (lo) - "vpsrlw $2,%%ymm1,%%ymm1 \n" // 3/4*near+1/4*far (hi) - "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" - "vmovdqu %%ymm0,(%1) \n" - - "lea 0x10(%0),%0 \n" - "lea 0x20(%1),%1 \n" // 8 uv to 16 uv - "sub $0x10,%2 \n" - "jg 1b \n" - "vzeroupper \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : "m"(kUVLinearMadd31) // %3 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"); -} -#endif - -#ifdef HAS_SCALEUVROWUP2BILINEAR_AVX2 -void ScaleUVRowUp2_Bilinear_AVX2(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - ptrdiff_t dst_stride, - int dst_width) { - asm volatile( - "vpcmpeqw %%ymm6,%%ymm6,%%ymm6 \n" - "vpsrlw $15,%%ymm6,%%ymm6 \n" - "vpsllw $3,%%ymm6,%%ymm6 \n" // all 8 - "vbroadcastf128 %5,%%ymm7 \n" - - LABELALIGN - "1: \n" - "vmovdqu (%0),%%xmm0 \n" - "vmovdqu 2(%0),%%xmm1 \n" - "vpermq $0b11011000,%%ymm0,%%ymm0 \n" - "vpermq $0b11011000,%%ymm1,%%ymm1 \n" - "vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n" - "vpunpckhdq %%ymm0,%%ymm0,%%ymm2 \n" - "vpunpckldq %%ymm0,%%ymm0,%%ymm0 \n" - "vpmaddubsw %%ymm7,%%ymm2,%%ymm1 \n" // 3*near+far (1, hi) - "vpmaddubsw %%ymm7,%%ymm0,%%ymm0 \n" // 3*near+far (1, lo) - - "vmovdqu (%0,%3),%%xmm2 \n" // 0123456789ABCDEF - "vmovdqu 2(%0,%3),%%xmm3 \n" // 123456789ABCDEF0 - "vpermq $0b11011000,%%ymm2,%%ymm2 \n" - "vpermq $0b11011000,%%ymm3,%%ymm3 \n" - "vpunpcklbw %%ymm3,%%ymm2,%%ymm2 \n" - "vpunpckhdq %%ymm2,%%ymm2,%%ymm4 \n" - "vpunpckldq %%ymm2,%%ymm2,%%ymm2 \n" - "vpmaddubsw %%ymm7,%%ymm4,%%ymm3 \n" // 3*near+far (2, hi) - "vpmaddubsw %%ymm7,%%ymm2,%%ymm2 \n" // 3*near+far (2, lo) - - // ymm0 ymm1 - // ymm2 ymm3 - - "vpaddw %%ymm0,%%ymm0,%%ymm4 \n" // 6*near+2*far (1, lo) - "vpaddw %%ymm6,%%ymm2,%%ymm5 \n" // 3*near+far+8 (2, lo) - "vpaddw %%ymm4,%%ymm0,%%ymm4 \n" // 9*near+3*far (1, lo) - "vpaddw %%ymm4,%%ymm5,%%ymm4 \n" // 9 3 3 1 + 8 (1, lo) - "vpsrlw $4,%%ymm4,%%ymm4 \n" // ^ div by 16 (1, lo) - - "vpaddw %%ymm2,%%ymm2,%%ymm5 \n" // 6*near+2*far (2, lo) - "vpaddw %%ymm6,%%ymm0,%%ymm0 \n" // 3*near+far+8 (1, lo) - "vpaddw %%ymm5,%%ymm2,%%ymm5 \n" // 9*near+3*far (2, lo) - "vpaddw %%ymm5,%%ymm0,%%ymm5 \n" // 9 3 3 1 + 8 (2, lo) - "vpsrlw $4,%%ymm5,%%ymm5 \n" // ^ div by 16 (2, lo) - - "vpaddw %%ymm1,%%ymm1,%%ymm0 \n" // 6*near+2*far (1, hi) - "vpaddw %%ymm6,%%ymm3,%%ymm2 \n" // 3*near+far+8 (2, hi) - "vpaddw %%ymm0,%%ymm1,%%ymm0 \n" // 9*near+3*far (1, hi) - "vpaddw %%ymm0,%%ymm2,%%ymm0 \n" // 9 3 3 1 + 8 (1, hi) - "vpsrlw $4,%%ymm0,%%ymm0 \n" // ^ div by 16 (1, hi) - - "vpaddw %%ymm3,%%ymm3,%%ymm2 \n" // 6*near+2*far (2, hi) - "vpaddw 
%%ymm6,%%ymm1,%%ymm1 \n" // 3*near+far+8 (1, hi) - "vpaddw %%ymm2,%%ymm3,%%ymm2 \n" // 9*near+3*far (2, hi) - "vpaddw %%ymm2,%%ymm1,%%ymm2 \n" // 9 3 3 1 + 8 (2, hi) - "vpsrlw $4,%%ymm2,%%ymm2 \n" // ^ div by 16 (2, hi) - - "vpackuswb %%ymm0,%%ymm4,%%ymm4 \n" - "vmovdqu %%ymm4,(%1) \n" // store above - "vpackuswb %%ymm2,%%ymm5,%%ymm5 \n" - "vmovdqu %%ymm5,(%1,%4) \n" // store below - - "lea 0x10(%0),%0 \n" - "lea 0x20(%1),%1 \n" // 8 uv to 16 uv - "sub $0x10,%2 \n" - "jg 1b \n" - "vzeroupper \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : "r"((intptr_t)(src_stride)), // %3 - "r"((intptr_t)(dst_stride)), // %4 - "m"(kUVLinearMadd31) // %5 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", - "xmm7"); -} -#endif - -#ifdef HAS_SCALEUVROWUP2LINEAR_16_SSE2 -void ScaleUVRowUp2_Linear_16_SSE2(const uint16_t* src_ptr, - uint16_t* dst_ptr, - int dst_width) { - asm volatile( - "pxor %%xmm5,%%xmm5 \n" - "pcmpeqd %%xmm4,%%xmm4 \n" - "psrld $31,%%xmm4 \n" - "pslld $1,%%xmm4 \n" // all 2 - - LABELALIGN - "1: \n" - "movq (%0),%%xmm0 \n" // 0011 (16b, 1u1v) - "movq 4(%0),%%xmm1 \n" // 1122 (16b, 1u1v) - - "punpcklwd %%xmm5,%%xmm0 \n" // 0011 (32b, 1u1v) - "punpcklwd %%xmm5,%%xmm1 \n" // 1122 (32b, 1u1v) - - "movdqa %%xmm0,%%xmm2 \n" - "movdqa %%xmm1,%%xmm3 \n" - - "pshufd $0b01001110,%%xmm2,%%xmm2 \n" // 1100 (lo, far) - "pshufd $0b01001110,%%xmm3,%%xmm3 \n" // 2211 (hi, far) - - "paddd %%xmm4,%%xmm2 \n" // far+2 (lo) - "paddd %%xmm4,%%xmm3 \n" // far+2 (hi) - "paddd %%xmm0,%%xmm2 \n" // near+far+2 (lo) - "paddd %%xmm1,%%xmm3 \n" // near+far+2 (hi) - "paddd %%xmm0,%%xmm0 \n" // 2*near (lo) - "paddd %%xmm1,%%xmm1 \n" // 2*near (hi) - "paddd %%xmm2,%%xmm0 \n" // 3*near+far+2 (lo) - "paddd %%xmm3,%%xmm1 \n" // 3*near+far+2 (hi) - - "psrld $2,%%xmm0 \n" // 3/4*near+1/4*far (lo) - "psrld $2,%%xmm1 \n" // 3/4*near+1/4*far (hi) - "packusdw %%xmm1,%%xmm0 \n" - "movdqu %%xmm0,(%1) \n" - - "lea 0x8(%0),%0 \n" - "lea 0x10(%1),%1 \n" // 2 uv to 4 uv - "sub $0x4,%2 \n" - "jg 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); -} -#endif - -#ifdef HAS_SCALEUVROWUP2BILINEAR_16_SSE2 -void ScaleUVRowUp2_Bilinear_16_SSE2(const uint16_t* src_ptr, - ptrdiff_t src_stride, - uint16_t* dst_ptr, - ptrdiff_t dst_stride, - int dst_width) { - asm volatile( - "pxor %%xmm7,%%xmm7 \n" - "pcmpeqd %%xmm6,%%xmm6 \n" - "psrld $31,%%xmm6 \n" - "pslld $3,%%xmm6 \n" // all 8 - - LABELALIGN - "1: \n" - "movq (%0),%%xmm0 \n" // 0011 (16b, 1u1v) - "movq 4(%0),%%xmm1 \n" // 1122 (16b, 1u1v) - "punpcklwd %%xmm7,%%xmm0 \n" // 0011 (near) (32b, 1u1v) - "punpcklwd %%xmm7,%%xmm1 \n" // 1122 (near) (32b, 1u1v) - "movdqa %%xmm0,%%xmm2 \n" - "movdqa %%xmm1,%%xmm3 \n" - "pshufd $0b01001110,%%xmm2,%%xmm2 \n" // 1100 (far) (1, lo) - "pshufd $0b01001110,%%xmm3,%%xmm3 \n" // 2211 (far) (1, hi) - "paddd %%xmm0,%%xmm2 \n" // near+far (1, lo) - "paddd %%xmm1,%%xmm3 \n" // near+far (1, hi) - "paddd %%xmm0,%%xmm0 \n" // 2*near (1, lo) - "paddd %%xmm1,%%xmm1 \n" // 2*near (1, hi) - "paddd %%xmm2,%%xmm0 \n" // 3*near+far (1, lo) - "paddd %%xmm3,%%xmm1 \n" // 3*near+far (1, hi) - - "movq (%0,%3,2),%%xmm2 \n" - "movq 4(%0,%3,2),%%xmm3 \n" - "punpcklwd %%xmm7,%%xmm2 \n" - "punpcklwd %%xmm7,%%xmm3 \n" - "movdqa %%xmm2,%%xmm4 \n" - "movdqa %%xmm3,%%xmm5 \n" - "pshufd $0b01001110,%%xmm4,%%xmm4 \n" // 1100 (far) (2, lo) - "pshufd $0b01001110,%%xmm5,%%xmm5 \n" // 2211 (far) (2, hi) - "paddd %%xmm2,%%xmm4 \n" // near+far 
(2, lo) - "paddd %%xmm3,%%xmm5 \n" // near+far (2, hi) - "paddd %%xmm2,%%xmm2 \n" // 2*near (2, lo) - "paddd %%xmm3,%%xmm3 \n" // 2*near (2, hi) - "paddd %%xmm4,%%xmm2 \n" // 3*near+far (2, lo) - "paddd %%xmm5,%%xmm3 \n" // 3*near+far (2, hi) - - "movdqa %%xmm0,%%xmm4 \n" - "movdqa %%xmm2,%%xmm5 \n" - "paddd %%xmm0,%%xmm4 \n" // 6*near+2*far (1, lo) - "paddd %%xmm6,%%xmm5 \n" // 3*near+far+8 (2, lo) - "paddd %%xmm0,%%xmm4 \n" // 9*near+3*far (1, lo) - "paddd %%xmm5,%%xmm4 \n" // 9 3 3 1 + 8 (1, lo) - "psrld $4,%%xmm4 \n" // ^ div by 16 (1, lo) - - "movdqa %%xmm2,%%xmm5 \n" - "paddd %%xmm2,%%xmm5 \n" // 6*near+2*far (2, lo) - "paddd %%xmm6,%%xmm0 \n" // 3*near+far+8 (1, lo) - "paddd %%xmm2,%%xmm5 \n" // 9*near+3*far (2, lo) - "paddd %%xmm0,%%xmm5 \n" // 9 3 3 1 + 8 (2, lo) - "psrld $4,%%xmm5 \n" // ^ div by 16 (2, lo) - - "movdqa %%xmm1,%%xmm0 \n" - "movdqa %%xmm3,%%xmm2 \n" - "paddd %%xmm1,%%xmm0 \n" // 6*near+2*far (1, hi) - "paddd %%xmm6,%%xmm2 \n" // 3*near+far+8 (2, hi) - "paddd %%xmm1,%%xmm0 \n" // 9*near+3*far (1, hi) - "paddd %%xmm2,%%xmm0 \n" // 9 3 3 1 + 8 (1, hi) - "psrld $4,%%xmm0 \n" // ^ div by 16 (1, hi) - - "movdqa %%xmm3,%%xmm2 \n" - "paddd %%xmm3,%%xmm2 \n" // 6*near+2*far (2, hi) - "paddd %%xmm6,%%xmm1 \n" // 3*near+far+8 (1, hi) - "paddd %%xmm3,%%xmm2 \n" // 9*near+3*far (2, hi) - "paddd %%xmm1,%%xmm2 \n" // 9 3 3 1 + 8 (2, hi) - "psrld $4,%%xmm2 \n" // ^ div by 16 (2, hi) - - "packusdw %%xmm0,%%xmm4 \n" - "movdqu %%xmm4,(%1) \n" // store above - "packusdw %%xmm2,%%xmm5 \n" - "movdqu %%xmm5,(%1,%4,2) \n" // store below - - "lea 0x8(%0),%0 \n" - "lea 0x10(%1),%1 \n" // 2 uv to 4 uv - "sub $0x4,%2 \n" - "jg 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : "r"((intptr_t)(src_stride)), // %3 - "r"((intptr_t)(dst_stride)) // %4 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", - "xmm7"); -} -#endif - -#ifdef HAS_SCALEUVROWUP2LINEAR_16_AVX2 -void ScaleUVRowUp2_Linear_16_AVX2(const uint16_t* src_ptr, - uint16_t* dst_ptr, - int dst_width) { - asm volatile( - "vpcmpeqd %%ymm4,%%ymm4,%%ymm4 \n" - "vpsrld $31,%%ymm4,%%ymm4 \n" - "vpslld $1,%%ymm4,%%ymm4 \n" // all 2 - - LABELALIGN - "1: \n" - "vmovdqu (%0),%%xmm0 \n" // 00112233 (16b, 1u1v) - "vmovdqu 4(%0),%%xmm1 \n" // 11223344 (16b, 1u1v) - - "vpmovzxwd %%xmm0,%%ymm0 \n" // 01234567 (32b, 1u1v) - "vpmovzxwd %%xmm1,%%ymm1 \n" // 12345678 (32b, 1u1v) - - "vpshufd $0b01001110,%%ymm0,%%ymm2 \n" // 11003322 (lo, far) - "vpshufd $0b01001110,%%ymm1,%%ymm3 \n" // 22114433 (hi, far) - - "vpaddd %%ymm4,%%ymm2,%%ymm2 \n" // far+2 (lo) - "vpaddd %%ymm4,%%ymm3,%%ymm3 \n" // far+2 (hi) - "vpaddd %%ymm0,%%ymm2,%%ymm2 \n" // near+far+2 (lo) - "vpaddd %%ymm1,%%ymm3,%%ymm3 \n" // near+far+2 (hi) - "vpaddd %%ymm0,%%ymm0,%%ymm0 \n" // 2*near (lo) - "vpaddd %%ymm1,%%ymm1,%%ymm1 \n" // 2*near (hi) - "vpaddd %%ymm0,%%ymm2,%%ymm0 \n" // 3*near+far+2 (lo) - "vpaddd %%ymm1,%%ymm3,%%ymm1 \n" // 3*near+far+2 (hi) - - "vpsrld $2,%%ymm0,%%ymm0 \n" // 3/4*near+1/4*far (lo) - "vpsrld $2,%%ymm1,%%ymm1 \n" // 3/4*near+1/4*far (hi) - "vpackusdw %%ymm1,%%ymm0,%%ymm0 \n" - "vmovdqu %%ymm0,(%1) \n" - - "lea 0x10(%0),%0 \n" - "lea 0x20(%1),%1 \n" // 4 uv to 8 uv - "sub $0x8,%2 \n" - "jg 1b \n" - "vzeroupper \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"); -} -#endif - -#ifdef HAS_SCALEUVROWUP2BILINEAR_16_AVX2 -void ScaleUVRowUp2_Bilinear_16_AVX2(const uint16_t* src_ptr, - ptrdiff_t src_stride, - uint16_t* dst_ptr, - 
ptrdiff_t dst_stride, - int dst_width) { - asm volatile( - "vpcmpeqd %%ymm6,%%ymm6,%%ymm6 \n" - "vpsrld $31,%%ymm6,%%ymm6 \n" - "vpslld $3,%%ymm6,%%ymm6 \n" // all 8 - - LABELALIGN - "1: \n" - - "vmovdqu (%0),%%xmm0 \n" // 00112233 (16b, 1u1v) - "vmovdqu 4(%0),%%xmm1 \n" // 11223344 (16b, 1u1v) - "vpmovzxwd %%xmm0,%%ymm0 \n" // 01234567 (32b, 1u1v) - "vpmovzxwd %%xmm1,%%ymm1 \n" // 12345678 (32b, 1u1v) - "vpshufd $0b01001110,%%ymm0,%%ymm2 \n" // 11003322 (lo, far) - "vpshufd $0b01001110,%%ymm1,%%ymm3 \n" // 22114433 (hi, far) - "vpaddd %%ymm0,%%ymm2,%%ymm2 \n" // near+far (lo) - "vpaddd %%ymm1,%%ymm3,%%ymm3 \n" // near+far (hi) - "vpaddd %%ymm0,%%ymm0,%%ymm0 \n" // 2*near (lo) - "vpaddd %%ymm1,%%ymm1,%%ymm1 \n" // 2*near (hi) - "vpaddd %%ymm0,%%ymm2,%%ymm0 \n" // 3*near+far (lo) - "vpaddd %%ymm1,%%ymm3,%%ymm1 \n" // 3*near+far (hi) - - "vmovdqu (%0,%3,2),%%xmm2 \n" // 00112233 (16b, 1u1v) - "vmovdqu 4(%0,%3,2),%%xmm3 \n" // 11223344 (16b, 1u1v) - "vpmovzxwd %%xmm2,%%ymm2 \n" // 01234567 (32b, 1u1v) - "vpmovzxwd %%xmm3,%%ymm3 \n" // 12345678 (32b, 1u1v) - "vpshufd $0b01001110,%%ymm2,%%ymm4 \n" // 11003322 (lo, far) - "vpshufd $0b01001110,%%ymm3,%%ymm5 \n" // 22114433 (hi, far) - "vpaddd %%ymm2,%%ymm4,%%ymm4 \n" // near+far (lo) - "vpaddd %%ymm3,%%ymm5,%%ymm5 \n" // near+far (hi) - "vpaddd %%ymm2,%%ymm2,%%ymm2 \n" // 2*near (lo) - "vpaddd %%ymm3,%%ymm3,%%ymm3 \n" // 2*near (hi) - "vpaddd %%ymm2,%%ymm4,%%ymm2 \n" // 3*near+far (lo) - "vpaddd %%ymm3,%%ymm5,%%ymm3 \n" // 3*near+far (hi) - - "vpaddd %%ymm0,%%ymm0,%%ymm4 \n" // 6*near+2*far (1, lo) - "vpaddd %%ymm6,%%ymm2,%%ymm5 \n" // 3*near+far+8 (2, lo) - "vpaddd %%ymm4,%%ymm0,%%ymm4 \n" // 9*near+3*far (1, lo) - "vpaddd %%ymm4,%%ymm5,%%ymm4 \n" // 9 3 3 1 + 8 (1, lo) - "vpsrld $4,%%ymm4,%%ymm4 \n" // ^ div by 16 (1, lo) - - "vpaddd %%ymm2,%%ymm2,%%ymm5 \n" // 6*near+2*far (2, lo) - "vpaddd %%ymm6,%%ymm0,%%ymm0 \n" // 3*near+far+8 (1, lo) - "vpaddd %%ymm5,%%ymm2,%%ymm5 \n" // 9*near+3*far (2, lo) - "vpaddd %%ymm5,%%ymm0,%%ymm5 \n" // 9 3 3 1 + 8 (2, lo) - "vpsrld $4,%%ymm5,%%ymm5 \n" // ^ div by 16 (2, lo) - - "vpaddd %%ymm1,%%ymm1,%%ymm0 \n" // 6*near+2*far (1, hi) - "vpaddd %%ymm6,%%ymm3,%%ymm2 \n" // 3*near+far+8 (2, hi) - "vpaddd %%ymm0,%%ymm1,%%ymm0 \n" // 9*near+3*far (1, hi) - "vpaddd %%ymm0,%%ymm2,%%ymm0 \n" // 9 3 3 1 + 8 (1, hi) - "vpsrld $4,%%ymm0,%%ymm0 \n" // ^ div by 16 (1, hi) - - "vpaddd %%ymm3,%%ymm3,%%ymm2 \n" // 6*near+2*far (2, hi) - "vpaddd %%ymm6,%%ymm1,%%ymm1 \n" // 3*near+far+8 (1, hi) - "vpaddd %%ymm2,%%ymm3,%%ymm2 \n" // 9*near+3*far (2, hi) - "vpaddd %%ymm2,%%ymm1,%%ymm2 \n" // 9 3 3 1 + 8 (2, hi) - "vpsrld $4,%%ymm2,%%ymm2 \n" // ^ div by 16 (2, hi) - - "vpackusdw %%ymm0,%%ymm4,%%ymm4 \n" - "vmovdqu %%ymm4,(%1) \n" // store above - "vpackusdw %%ymm2,%%ymm5,%%ymm5 \n" - "vmovdqu %%ymm5,(%1,%4,2) \n" // store below - - "lea 0x10(%0),%0 \n" - "lea 0x20(%1),%1 \n" // 4 uv to 8 uv - "sub $0x8,%2 \n" - "jg 1b \n" - "vzeroupper \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : "r"((intptr_t)(src_stride)), // %3 - "r"((intptr_t)(dst_stride)) // %4 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"); -} -#endif - -#endif // defined(__x86_64__) || defined(__i386__) - -#ifdef __cplusplus -} // extern "C" -} // namespace libyuv -#endif diff --git a/thirdparty/libyuv/source/scale_mmi.cc b/thirdparty/libyuv/source/scale_mmi.cc deleted file mode 100644 index 1226ef3..0000000 --- a/thirdparty/libyuv/source/scale_mmi.cc +++ /dev/null @@ -1,1168 +0,0 @@ -/* - * Copyright 2013 The 
LibYuv Project Authors. All rights reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#include "libyuv/scale.h" - -#include -#include - -#include "libyuv/cpu_id.h" -#include "libyuv/planar_functions.h" // For CopyARGB -#include "libyuv/row.h" -#include "libyuv/scale_row.h" - -#ifdef __cplusplus -namespace libyuv { -extern "C" { -#endif - -// This module is for Mips MMI. -#if !defined(LIBYUV_DISABLE_MMI) && defined(_MIPS_ARCH_LOONGSON3A) - -// clang-format off - -// CPU agnostic row functions -void ScaleRowDown2_MMI(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst, - int dst_width) { - (void)src_stride; - - uint64_t src0, src1, dest; - const uint64_t shift = 0x8ULL; - - __asm__ volatile( - "1: \n\t" - "gsldrc1 %[src0], 0x00(%[src_ptr]) \n\t" - "gsldlc1 %[src0], 0x07(%[src_ptr]) \n\t" - "psrlh %[src0], %[src0], %[shift] \n\t" - - "gsldrc1 %[src1], 0x08(%[src_ptr]) \n\t" - "gsldlc1 %[src1], 0x0f(%[src_ptr]) \n\t" - "psrlh %[src1], %[src1], %[shift] \n\t" - - "packushb %[dest], %[src0], %[src1] \n\t" - "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t" - "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t" - - "daddiu %[src_ptr], %[src_ptr], 0x10 \n\t" - "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t" - "daddi %[width], %[width], -0x08 \n\t" - "bnez %[width], 1b \n\t" - : [src0] "=&f"(src0), [src1] "=&f"(src1), [dest] "=&f"(dest) - : [src_ptr] "r"(src_ptr), [dst_ptr] "r"(dst), [width] "r"(dst_width), - [shift] "f"(shift) - : "memory"); -} - -void ScaleRowDown2Linear_MMI(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst, - int dst_width) { - (void)src_stride; - - uint64_t src0, src1; - uint64_t dest, dest0, dest1; - - const uint64_t mask = 0x00ff00ff00ff00ffULL; - const uint64_t shift = 0x8ULL; - - __asm__ volatile( - "1: \n\t" - "gsldrc1 %[src0], 0x00(%[src_ptr]) \n\t" - "gsldlc1 %[src0], 0x07(%[src_ptr]) \n\t" - "and %[dest0], %[src0], %[mask] \n\t" - "gsldrc1 %[src1], 0x08(%[src_ptr]) \n\t" - "gsldlc1 %[src1], 0x0f(%[src_ptr]) \n\t" - "and %[dest1], %[src1], %[mask] \n\t" - "packushb %[dest0], %[dest0], %[dest1] \n\t" - - "psrlh %[src0], %[src0], %[shift] \n\t" - "psrlh %[src1], %[src1], %[shift] \n\t" - "packushb %[dest1], %[src0], %[src1] \n\t" - - "pavgb %[dest], %[dest0], %[dest1] \n\t" - "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t" - "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t" - - "daddiu %[src_ptr], %[src_ptr], 0x10 \n\t" - "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t" - "daddi %[width], %[width], -0x08 \n\t" - "bnez %[width], 1b \n\t" - : [src0] "=&f"(src0), [src1] "=&f"(src1), [dest0] "=&f"(dest0), - [dest1] "=&f"(dest1), [dest] "=&f"(dest) - : [src_ptr] "r"(src_ptr), [dst_ptr] "r"(dst), [mask] "f"(mask), - [shift] "f"(shift), [width] "r"(dst_width) - : "memory"); -} - -void ScaleRowDown2Box_MMI(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst, - int dst_width) { - const uint8_t* s = src_ptr; - const uint8_t* t = src_ptr + src_stride; - - uint64_t s0, s1, t0, t1; - uint64_t dest, dest0, dest1; - - const uint64_t ph = 0x0002000200020002ULL; - const uint64_t mask = 0x00ff00ff00ff00ffULL; - const uint64_t shift0 = 0x2ULL; - const uint64_t shift1 = 0x8ULL; - - __asm__ volatile( - "1: \n\t" - "gsldrc1 %[s0], 0x00(%[s]) \n\t" - "gsldlc1 %[s0], 0x07(%[s]) \n\t" - "psrlh %[s1], %[s0], 
%[shift1] \n\t" - "and %[s0], %[s0], %[mask] \n\t" - - "gsldrc1 %[t0], 0x00(%[t]) \n\t" - "gsldlc1 %[t0], 0x07(%[t]) \n\t" - "psrlh %[t1], %[t0], %[shift1] \n\t" - "and %[t0], %[t0], %[mask] \n\t" - - "paddh %[dest0], %[s0], %[s1] \n\t" - "paddh %[dest0], %[dest0], %[t0] \n\t" - "paddh %[dest0], %[dest0], %[t1] \n\t" - "paddh %[dest0], %[dest0], %[ph] \n\t" - "psrlh %[dest0], %[dest0], %[shift0] \n\t" - - "gsldrc1 %[s0], 0x08(%[s]) \n\t" - "gsldlc1 %[s0], 0x0f(%[s]) \n\t" - "psrlh %[s1], %[s0], %[shift1] \n\t" - "and %[s0], %[s0], %[mask] \n\t" - - "gsldrc1 %[t0], 0x08(%[t]) \n\t" - "gsldlc1 %[t0], 0x0f(%[t]) \n\t" - "psrlh %[t1], %[t0], %[shift1] \n\t" - "and %[t0], %[t0], %[mask] \n\t" - - "paddh %[dest1], %[s0], %[s1] \n\t" - "paddh %[dest1], %[dest1], %[t0] \n\t" - "paddh %[dest1], %[dest1], %[t1] \n\t" - "paddh %[dest1], %[dest1], %[ph] \n\t" - "psrlh %[dest1], %[dest1], %[shift0] \n\t" - - "packushb %[dest], %[dest0], %[dest1] \n\t" - "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t" - "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t" - - "daddiu %[s], %[s], 0x10 \n\t" - "daddiu %[t], %[t], 0x10 \n\t" - "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t" - "daddi %[width], %[width], -0x08 \n\t" - "bnez %[width], 1b \n\t" - : [s0] "=&f"(s0), [s1] "=&f"(s1), [t0] "=&f"(t0), [t1] "=&f"(t1), - [dest0] "=&f"(dest0), [dest1] "=&f"(dest1), [dest] "=&f"(dest) - : [s] "r"(s), [t] "r"(t), [dst_ptr] "r"(dst), [width] "r"(dst_width), - [shift0] "f"(shift0), [shift1] "f"(shift1), [ph] "f"(ph), - [mask] "f"(mask) - : "memory"); -} - -void ScaleARGBRowDown2_MMI(const uint8_t* src_argb, - ptrdiff_t src_stride, - uint8_t* dst_argb, - int dst_width) { - (void)src_stride; - - const uint32_t* src = (const uint32_t*)(src_argb); - uint32_t* dst = (uint32_t*)(dst_argb); - - uint64_t src0, src1, dest; - - __asm__ volatile( - "1: \n\t" - "gsldrc1 %[src0], 0x00(%[src_ptr]) \n\t" - "gsldlc1 %[src0], 0x07(%[src_ptr]) \n\t" - "gsldrc1 %[src1], 0x08(%[src_ptr]) \n\t" - "gsldlc1 %[src1], 0x0f(%[src_ptr]) \n\t" - "punpckhwd %[dest], %[src0], %[src1] \n\t" - - "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t" - "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t" - - "daddiu %[src_ptr], %[src_ptr], 0x10 \n\t" - "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t" - "daddi %[width], %[width], -0x02 \n\t" - "bnez %[width], 1b \n\t" - : [src0] "=&f"(src0), [src1] "=&f"(src1), [dest] "=&f"(dest) - : [src_ptr] "r"(src), [dst_ptr] "r"(dst), [width] "r"(dst_width) - : "memory"); -} - -void ScaleARGBRowDown2Linear_MMI(const uint8_t* src_argb, - ptrdiff_t src_stride, - uint8_t* dst_argb, - int dst_width) { - (void)src_stride; - - uint64_t src0, src1; - uint64_t dest, dest_hi, dest_lo; - - __asm__ volatile( - "1: \n\t" - "lwc1 %[src0], 0x00(%[src_ptr]) \n\t" - "lwc1 %[src1], 0x08(%[src_ptr]) \n\t" - "punpcklwd %[dest_lo], %[src0], %[src1] \n\t" - "lwc1 %[src0], 0x04(%[src_ptr]) \n\t" - "lwc1 %[src1], 0x0c(%[src_ptr]) \n\t" - "punpcklwd %[dest_hi], %[src0], %[src1] \n\t" - - "pavgb %[dest], %[dest_lo], %[dest_hi] \n\t" - "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t" - "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t" - - "daddiu %[src_ptr], %[src_ptr], 0x10 \n\t" - "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t" - "daddi %[width], %[width], -0x02 \n\t" - "bnez %[width], 1b \n\t" - : [src0] "=&f"(src0), [src1] "=&f"(src1), [dest_hi] "=&f"(dest_hi), - [dest_lo] "=&f"(dest_lo), [dest] "=&f"(dest) - : [src_ptr] "r"(src_argb), [dst_ptr] "r"(dst_argb), [width] "r"(dst_width) - : "memory"); -} - -void ScaleARGBRowDown2Box_MMI(const uint8_t* src_argb, - ptrdiff_t src_stride, - uint8_t* dst_argb, - int 
dst_width) { - const uint8_t* s = src_argb; - const uint8_t* t = src_argb + src_stride; - - uint64_t s0, s_hi, s_lo; - uint64_t t0, t_hi, t_lo; - uint64_t dest, dest_hi, dest_lo; - - const uint64_t mask = 0x0ULL; - const uint64_t ph = 0x0002000200020002ULL; - const uint64_t shfit = 0x2ULL; - - __asm__ volatile( - "1: \n\t" - "gsldrc1 %[s0], 0x00(%[s]) \n\t" - "gsldlc1 %[s0], 0x07(%[s]) \n\t" - "punpcklbh %[s_lo], %[s0], %[mask] \n\t" - "punpckhbh %[s_hi], %[s0], %[mask] \n\t" - "paddh %[dest_lo], %[s_lo], %[s_hi] \n\t" - - "gsldrc1 %[t0], 0x00(%[t]) \n\t" - "gsldlc1 %[t0], 0x07(%[t]) \n\t" - "punpcklbh %[t_lo], %[t0], %[mask] \n\t" - "punpckhbh %[t_hi], %[t0], %[mask] \n\t" - "paddh %[dest_lo], %[dest_lo], %[t_lo] \n\t" - "paddh %[dest_lo], %[dest_lo], %[t_hi] \n\t" - - "paddh %[dest_lo], %[dest_lo], %[ph] \n\t" - "psrlh %[dest_lo], %[dest_lo], %[shfit] \n\t" - - "gsldrc1 %[s0], 0x08(%[s]) \n\t" - "gsldlc1 %[s0], 0x0f(%[s]) \n\t" - "punpcklbh %[s_lo], %[s0], %[mask] \n\t" - "punpckhbh %[s_hi], %[s0], %[mask] \n\t" - "paddh %[dest_hi], %[s_lo], %[s_hi] \n\t" - - "gsldrc1 %[t0], 0x08(%[t]) \n\t" - "gsldlc1 %[t0], 0x0f(%[t]) \n\t" - "punpcklbh %[t_lo], %[t0], %[mask] \n\t" - "punpckhbh %[t_hi], %[t0], %[mask] \n\t" - "paddh %[dest_hi], %[dest_hi], %[t_lo] \n\t" - "paddh %[dest_hi], %[dest_hi], %[t_hi] \n\t" - - "paddh %[dest_hi], %[dest_hi], %[ph] \n\t" - "psrlh %[dest_hi], %[dest_hi], %[shfit] \n\t" - - "packushb %[dest], %[dest_lo], %[dest_hi] \n\t" - "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t" - "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t" - - "daddiu %[s], %[s], 0x10 \n\t" - "daddiu %[t], %[t], 0x10 \n\t" - "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t" - "daddi %[width], %[width], -0x02 \n\t" - "bnez %[width], 1b \n\t" - : [s0] "=&f"(s0), [t0] "=&f"(t0), [dest_hi] "=&f"(dest_hi), - [dest_lo] "=&f"(dest_lo), [s_hi] "=&f"(s_hi), [s_lo] "=&f"(s_lo), - [t_hi] "=&f"(t_hi), [t_lo] "=&f"(t_lo), [dest] "=&f"(dest) - : [s] "r"(s), [t] "r"(t), [dst_ptr] "r"(dst_argb), [width] "r"(dst_width), - [mask] "f"(mask), [ph] "f"(ph), [shfit] "f"(shfit) - : "memory"); -} - -void ScaleRowDown2_16_MMI(const uint16_t* src_ptr, - ptrdiff_t src_stride, - uint16_t* dst, - int dst_width) { - (void)src_stride; - - uint64_t src0, src1, dest; - const uint64_t shift = 0x10ULL; - - __asm__ volatile( - "1: \n\t" - "gsldrc1 %[src0], 0x00(%[src_ptr]) \n\t" - "gsldlc1 %[src0], 0x07(%[src_ptr]) \n\t" - "psrlw %[src0], %[src0], %[shift] \n\t" - - "gsldrc1 %[src1], 0x08(%[src_ptr]) \n\t" - "gsldlc1 %[src1], 0x0f(%[src_ptr]) \n\t" - "psrlw %[src1], %[src1], %[shift] \n\t" - - "packsswh %[dest], %[src0], %[src1] \n\t" - "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t" - "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t" - - "daddiu %[src_ptr], %[src_ptr], 0x10 \n\t" - "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t" - "daddi %[width], %[width], -0x04 \n\t" - "bnez %[width], 1b \n\t" - : [src0] "=&f"(src0), [src1] "=&f"(src1), [dest] "=&f"(dest) - : [src_ptr] "r"(src_ptr), [dst_ptr] "r"(dst), [width] "r"(dst_width), - [shift] "f"(shift) - : "memory"); -} - -void ScaleRowDown2Linear_16_MMI(const uint16_t* src_ptr, - ptrdiff_t src_stride, - uint16_t* dst, - int dst_width) { - (void)src_stride; - - uint64_t src0, src1; - uint64_t dest, dest_hi, dest_lo; - - __asm__ volatile( - "1: \n\t" - "gsldrc1 %[src0], 0x00(%[src_ptr]) \n\t" - "gsldlc1 %[src0], 0x07(%[src_ptr]) \n\t" - "gsldrc1 %[src1], 0x08(%[src_ptr]) \n\t" - "gsldlc1 %[src1], 0x0f(%[src_ptr]) \n\t" - "punpcklhw %[dest_lo], %[src0], %[src1] \n\t" - "punpckhhw %[dest_hi], %[src0], %[src1] \n\t" - - "punpcklhw 
%[src0], %[dest_lo], %[dest_hi] \n\t" - "punpckhhw %[src1], %[dest_lo], %[dest_hi] \n\t" - - "pavgh %[dest], %[src0], %[src1] \n\t" - "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t" - "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t" - - "daddiu %[src_ptr], %[src_ptr], 0x10 \n\t" - "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t" - "daddi %[width], %[width], -0x04 \n\t" - "bnez %[width], 1b \n\t" - : [src0] "=&f"(src0), [src1] "=&f"(src1), [dest_hi] "=&f"(dest_hi), - [dest_lo] "=&f"(dest_lo), [dest] "=&f"(dest) - : [src_ptr] "r"(src_ptr), [dst_ptr] "r"(dst), [width] "r"(dst_width) - : "memory"); -} - -void ScaleRowDown2Box_16_MMI(const uint16_t* src_ptr, - ptrdiff_t src_stride, - uint16_t* dst, - int dst_width) { - const uint16_t* s = src_ptr; - const uint16_t* t = src_ptr + src_stride; - - uint64_t s0, s1, s_hi, s_lo; - uint64_t t0, t1, t_hi, t_lo; - uint64_t dest, dest0, dest1; - - const uint64_t ph = 0x0000000200000002ULL; - const uint64_t mask = 0x0000ffff0000ffffULL; - const uint64_t shift0 = 0x10ULL; - const uint64_t shift1 = 0x2ULL; - - __asm__ volatile( - "1: \n\t" - "gsldrc1 %[s0], 0x00(%[s]) \n\t" - "gsldlc1 %[s0], 0x07(%[s]) \n\t" - "psrlw %[s1], %[s0], %[shift0] \n\t" - "and %[s0], %[s0], %[mask] \n\t" - - "gsldrc1 %[t0], 0x00(%[t]) \n\t" - "gsldlc1 %[t0], 0x07(%[t]) \n\t" - "psrlw %[t1], %[t0], %[shift0] \n\t" - "and %[t0], %[t0], %[mask] \n\t" - - "paddw %[dest0], %[s0], %[s1] \n\t" - "paddw %[dest0], %[dest0], %[t0] \n\t" - "paddw %[dest0], %[dest0], %[t1] \n\t" - "paddw %[dest0], %[dest0], %[ph] \n\t" - "psrlw %[dest0], %[dest0], %[shift1] \n\t" - - "gsldrc1 %[s0], 0x08(%[s]) \n\t" - "gsldlc1 %[s0], 0x0f(%[s]) \n\t" - "psrlw %[s1], %[s0], %[shift0] \n\t" - "and %[s0], %[s0], %[mask] \n\t" - - "gsldrc1 %[t0], 0x08(%[t]) \n\t" - "gsldlc1 %[t0], 0x0f(%[t]) \n\t" - "psrlw %[t1], %[t0], %[shift0] \n\t" - "and %[t0], %[t0], %[mask] \n\t" - - "paddw %[dest1], %[s0], %[s1] \n\t" - "paddw %[dest1], %[dest1], %[t0] \n\t" - "paddw %[dest1], %[dest1], %[t1] \n\t" - "paddw %[dest1], %[dest1], %[ph] \n\t" - "psrlw %[dest1], %[dest1], %[shift1] \n\t" - - "packsswh %[dest], %[dest0], %[dest1] \n\t" - "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t" - "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t" - - "daddiu %[s], %[s], 0x10 \n\t" - "daddiu %[t], %[t], 0x10 \n\t" - "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t" - "daddi %[width], %[width], -0x04 \n\t" - "bnez %[width], 1b \n\t" - : [s0] "=&f"(s0), [s1] "=&f"(s1), [t0] "=&f"(t0), [t1] "=&f"(t1), - [s_hi] "=&f"(s_hi), [s_lo] "=&f"(s_lo), [t_hi] "=&f"(t_hi), - [t_lo] "=&f"(t_lo), [dest0] "=&f"(dest0), [dest1] "=&f"(dest1), - [dest] "=&f"(dest) - : [s] "r"(s), [t] "r"(t), [dst_ptr] "r"(dst), [width] "r"(dst_width), - [shift0] "f"(shift0), [shift1] "f"(shift1), [ph] "f"(ph), - [mask] "f"(mask) - : "memory"); -} - -void ScaleRowDown4_MMI(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst, - int dst_width) { - (void)src_stride; - - uint64_t src0, src1; - uint64_t dest, dest_hi, dest_lo; - - const uint64_t shift = 0x10ULL; - const uint64_t mask = 0x000000ff000000ffULL; - - __asm__ volatile( - "1: \n\t" - "gsldrc1 %[src0], 0x00(%[src_ptr]) \n\t" - "gsldlc1 %[src0], 0x07(%[src_ptr]) \n\t" - "psrlw %[src0], %[src0], %[shift] \n\t" - "and %[src0], %[src0], %[mask] \n\t" - "gsldrc1 %[src1], 0x08(%[src_ptr]) \n\t" - "gsldlc1 %[src1], 0x0f(%[src_ptr]) \n\t" - "psrlw %[src1], %[src1], %[shift] \n\t" - "and %[src1], %[src1], %[mask] \n\t" - "packsswh %[dest_lo], %[src0], %[src1] \n\t" - - "gsldrc1 %[src0], 0x10(%[src_ptr]) \n\t" - "gsldlc1 %[src0], 0x17(%[src_ptr]) \n\t" - "psrlw 
%[src0], %[src0], %[shift] \n\t" - "and %[src0], %[src0], %[mask] \n\t" - "gsldrc1 %[src1], 0x18(%[src_ptr]) \n\t" - "gsldlc1 %[src1], 0x1f(%[src_ptr]) \n\t" - "psrlw %[src1], %[src1], %[shift] \n\t" - "and %[src1], %[src1], %[mask] \n\t" - "packsswh %[dest_hi], %[src0], %[src1] \n\t" - - "packushb %[dest], %[dest_lo], %[dest_hi] \n\t" - "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t" - "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t" - - "daddiu %[src_ptr], %[src_ptr], 0x20 \n\t" - "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t" - "daddi %[width], %[width], -0x08 \n\t" - "bnez %[width], 1b \n\t" - : [src0] "=&f"(src0), [src1] "=&f"(src1), [dest_hi] "=&f"(dest_hi), - [dest_lo] "=&f"(dest_lo), [dest] "=&f"(dest) - : [src_ptr] "r"(src_ptr), [dst_ptr] "r"(dst), [width] "r"(dst_width), - [shift] "f"(shift), [mask] "f"(mask) - : "memory"); -} - -void ScaleRowDown4_16_MMI(const uint16_t* src_ptr, - ptrdiff_t src_stride, - uint16_t* dst, - int dst_width) { - (void)src_stride; - - uint64_t src0, src1; - uint64_t dest, dest_hi, dest_lo; - - const uint64_t mask = 0x0ULL; - - __asm__ volatile( - "1: \n\t" - "gsldrc1 %[src0], 0x00(%[src_ptr]) \n\t" - "gsldlc1 %[src0], 0x07(%[src_ptr]) \n\t" - "gsldrc1 %[src1], 0x08(%[src_ptr]) \n\t" - "gsldlc1 %[src1], 0x0f(%[src_ptr]) \n\t" - "punpckhhw %[dest_lo], %[src0], %[src1] \n\t" - "punpcklhw %[dest_lo], %[dest_lo], %[mask] \n\t" - - "gsldrc1 %[src0], 0x10(%[src_ptr]) \n\t" - "gsldlc1 %[src0], 0x17(%[src_ptr]) \n\t" - "gsldrc1 %[src1], 0x18(%[src_ptr]) \n\t" - "gsldlc1 %[src1], 0x1f(%[src_ptr]) \n\t" - "punpckhhw %[dest_hi], %[src0], %[src1] \n\t" - "punpcklhw %[dest_hi], %[dest_hi], %[mask] \n\t" - - "packushb %[dest], %[dest_lo], %[dest_hi] \n\t" - "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t" - "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t" - - "daddiu %[src_ptr], %[src_ptr], 0x20 \n\t" - "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t" - "daddi %[width], %[width], -0x04 \n\t" - "bnez %[width], 1b \n\t" - : [src0] "=&f"(src0), [src1] "=&f"(src1), [dest_hi] "=&f"(dest_hi), - [dest_lo] "=&f"(dest_lo), [dest] "=&f"(dest) - : [src_ptr] "r"(src_ptr), [dst_ptr] "r"(dst), [width] "r"(dst_width), - [mask] "f"(mask) - : "memory"); -} - -#define DO_SCALEROWDOWN4BOX_PUNPCKADD() \ - "punpcklbh %[src_lo], %[src], %[mask0] \n\t" \ - "punpckhbh %[src_hi], %[src], %[mask0] \n\t" \ - "paddh %[dest_lo], %[dest_lo], %[src_lo] \n\t" \ - "paddh %[dest_hi], %[dest_hi], %[src_hi] \n\t" - -#define DO_SCALEROWDOWN4BOX_LOOP(reg) \ - "ldc1 %[src], 0x00(%[src0_ptr]) \n\t" \ - "punpcklbh %[dest_lo], %[src], %[mask0] \n\t" \ - "punpckhbh %[dest_hi], %[src], %[mask0] \n\t" \ - \ - "ldc1 %[src], 0x00(%[src1_ptr]) \n\t" \ - DO_SCALEROWDOWN4BOX_PUNPCKADD() \ - \ - "ldc1 %[src], 0x00(%[src2_ptr]) \n\t" \ - DO_SCALEROWDOWN4BOX_PUNPCKADD() \ - \ - "ldc1 %[src], 0x00(%[src3_ptr]) \n\t" \ - DO_SCALEROWDOWN4BOX_PUNPCKADD() \ - \ - "pmaddhw %[dest_lo], %[dest_lo], %[mask1] \n\t" \ - "pmaddhw %[dest_hi], %[dest_hi], %[mask1] \n\t" \ - "packsswh " #reg ", %[dest_lo], %[dest_hi] \n\t" \ - "pmaddhw " #reg ", " #reg ", %[mask1] \n\t" \ - "paddh " #reg ", " #reg ", %[ph] \n\t" \ - "psrlh " #reg ", " #reg ", %[shift] \n\t" \ - \ - "daddiu %[src0_ptr], %[src0_ptr], 0x08 \n\t" \ - "daddiu %[src1_ptr], %[src1_ptr], 0x08 \n\t" \ - "daddiu %[src2_ptr], %[src2_ptr], 0x08 \n\t" \ - "daddiu %[src3_ptr], %[src3_ptr], 0x08 \n\t" - -/* LibYUVScaleTest.ScaleDownBy4_Box */ -void ScaleRowDown4Box_MMI(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst, - int dst_width) { - const uint8_t* src0_ptr = src_ptr; - const uint8_t* src1_ptr = src_ptr 
+ src_stride; - const uint8_t* src2_ptr = src_ptr + src_stride * 2; - const uint8_t* src3_ptr = src_ptr + src_stride * 3; - - uint64_t src, src_hi, src_lo; - uint64_t dest, dest_hi, dest_lo, dest0, dest1, dest2, dest3; - - const uint64_t mask0 = 0x0ULL; - const uint64_t mask1 = 0x0001000100010001ULL; - const uint64_t ph = 0x0008000800080008ULL; - const uint64_t shift = 0x4ULL; - - __asm__ volatile( - "1: \n\t" - - DO_SCALEROWDOWN4BOX_LOOP(%[dest0]) - DO_SCALEROWDOWN4BOX_LOOP(%[dest1]) - DO_SCALEROWDOWN4BOX_LOOP(%[dest2]) - DO_SCALEROWDOWN4BOX_LOOP(%[dest3]) - - "packsswh %[dest_lo], %[dest0], %[dest1] \n\t" - "packsswh %[dest_hi], %[dest2], %[dest3] \n\t" - - "packushb %[dest], %[dest_lo], %[dest_hi] \n\t" - "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t" - "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t" - - "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t" - "daddi %[width], %[width], -0x08 \n\t" - "bnez %[width], 1b \n\t" - : [src_hi] "=&f"(src_hi), [src_lo] "=&f"(src_lo), - [dest_hi] "=&f"(dest_hi), [dest_lo] "=&f"(dest_lo), - [dest0] "=&f"(dest0), [dest1] "=&f"(dest1), [dest2] "=&f"(dest2), - [dest3] "=&f"(dest3), [src] "=&f"(src), [dest] "=&f"(dest) - : [src0_ptr] "r"(src0_ptr), [src1_ptr] "r"(src1_ptr), - [src2_ptr] "r"(src2_ptr), [src3_ptr] "r"(src3_ptr), [dst_ptr] "r"(dst), - [width] "r"(dst_width), [shift] "f"(shift), [mask0] "f"(mask0), - [ph] "f"(ph), [mask1] "f"(mask1) - : "memory"); -} - -#define DO_SCALEROWDOWN4BOX_16_PUNPCKADD() \ - "punpcklbh %[src_lo], %[src], %[mask0] \n\t" \ - "punpckhbh %[src_hi], %[src], %[mask0] \n\t" \ - "paddh %[dest_lo], %[dest_lo], %[src_lo] \n\t" \ - "paddh %[dest_hi], %[dest_hi], %[src_hi] \n\t" - -#define DO_SCALEROWDOWN4BOX_16_LOOP(reg) \ - "ldc1 %[src], 0x00(%[src0_ptr]) \n\t" \ - "punpcklbh %[dest_lo], %[src], %[mask0] \n\t" \ - "punpckhbh %[dest_hi], %[src], %[mask0] \n\t" \ - \ - "ldc1 %[src], 0x00(%[src1_ptr]) \n\t" \ - DO_SCALEROWDOWN4BOX_16_PUNPCKADD() \ - \ - "ldc1 %[src], 0x00(%[src2_ptr]) \n\t" \ - DO_SCALEROWDOWN4BOX_16_PUNPCKADD() \ - \ - "ldc1 %[src], 0x00(%[src3_ptr]) \n\t" \ - DO_SCALEROWDOWN4BOX_16_PUNPCKADD() \ - \ - "paddw %[dest], %[dest_lo], %[dest_hi] \n\t" \ - "punpckhwd %[dest_hi], %[dest], %[dest] \n\t" \ - "paddw %[dest], %[dest_hi], %[dest] \n\t" \ - "paddw %[dest], %[dest], %[ph] \n\t" \ - "psraw %[dest], %[dest], %[shift] \n\t" \ - "and " #reg ", %[dest], %[mask1] \n\t" \ - \ - "daddiu %[src0_ptr], %[src0_ptr], 0x08 \n\t" \ - "daddiu %[src1_ptr], %[src1_ptr], 0x08 \n\t" \ - "daddiu %[src2_ptr], %[src2_ptr], 0x08 \n\t" \ - "daddiu %[src3_ptr], %[src3_ptr], 0x08 \n\t" - -/* LibYUVScaleTest.ScaleDownBy4_Box_16 */ -void ScaleRowDown4Box_16_MMI(const uint16_t* src_ptr, - ptrdiff_t src_stride, - uint16_t* dst, - int dst_width) { - const uint16_t* src0_ptr = src_ptr; - const uint16_t* src1_ptr = src_ptr + src_stride; - const uint16_t* src2_ptr = src_ptr + src_stride * 2; - const uint16_t* src3_ptr = src_ptr + src_stride * 3; - - uint64_t src, src_hi, src_lo; - uint64_t dest, dest_hi, dest_lo, dest0, dest1, dest2, dest3; - - const uint64_t mask0 = 0x0ULL; - const uint64_t mask1 = 0x00000000ffffffffULL; - const uint64_t ph = 0x0000000800000008ULL; - const uint64_t shift = 0x04ULL; - - __asm__ volatile( - "1: \n\t" - - DO_SCALEROWDOWN4BOX_16_LOOP(%[dest0]) - DO_SCALEROWDOWN4BOX_16_LOOP(%[dest1]) - DO_SCALEROWDOWN4BOX_16_LOOP(%[dest2]) - DO_SCALEROWDOWN4BOX_16_LOOP(%[dest3]) - "punpcklwd %[dest_lo], %[dest0], %[dest1] \n\t" - "punpcklwd %[dest_hi], %[dest2], %[dest3] \n\t" - - "packushb %[dest], %[dest_lo], %[dest_hi] \n\t" - "gssdlc1 %[dest], 
0x07(%[dst_ptr]) \n\t" - "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t" - - "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t" - "daddi %[width], %[width], -0x04 \n\t" - "bnez %[width], 1b \n\t" - : [src_hi] "=&f"(src_hi), [src_lo] "=&f"(src_lo), - [dest_hi] "=&f"(dest_hi), [dest_lo] "=&f"(dest_lo), - [dest0] "=&f"(dest0), [dest1] "=&f"(dest1), [dest2] "=&f"(dest2), - [dest3] "=&f"(dest3), [src] "=&f"(src), [dest] "=&f"(dest) - : [src0_ptr] "r"(src0_ptr), [src1_ptr] "r"(src1_ptr), - [src2_ptr] "r"(src2_ptr), [src3_ptr] "r"(src3_ptr), [dst_ptr] "r"(dst), - [width] "r"(dst_width), [shift] "f"(shift), [mask0] "f"(mask0), - [ph] "f"(ph), [mask1] "f"(mask1) - : "memory"); -} - -// Scales a single row of pixels up by 2x using point sampling. -void ScaleColsUp2_MMI(uint8_t* dst_ptr, - const uint8_t* src_ptr, - int dst_width, - int x, - int dx) { - uint64_t src, dest; - - (void)x; - (void)dx; - - __asm__ volatile( - "1: \n\t" - "lwc1 %[src], 0x00(%[src_ptr]) \n\t" - - "punpcklbh %[dest], %[src], %[src] \n\t" - "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t" - "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t" - - "daddiu %[src_ptr], %[src_ptr], 0x04 \n\t" - "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t" - "daddi %[width], %[width], -0x08 \n\t" - "bnez %[width], 1b \n\t" - : [src] "=&f"(src), [dest] "=&f"(dest) - : [src_ptr] "r"(src_ptr), [dst_ptr] "r"(dst_ptr), [width] "r"(dst_width) - : "memory"); -} - -void ScaleColsUp2_16_MMI(uint16_t* dst_ptr, - const uint16_t* src_ptr, - int dst_width, - int x, - int dx) { - uint64_t src, dest; - - (void)x; - (void)dx; - - __asm__ volatile( - "1: \n\t" - "gsldrc1 %[src], 0x00(%[src_ptr]) \n\t" - "gsldlc1 %[src], 0x07(%[src_ptr]) \n\t" - - "punpcklhw %[dest], %[src], %[src] \n\t" - "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t" - "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t" - - "punpckhhw %[dest], %[src], %[src] \n\t" - "gssdlc1 %[dest], 0x0f(%[dst_ptr]) \n\t" - "gssdrc1 %[dest], 0x08(%[dst_ptr]) \n\t" - - "daddiu %[src_ptr], %[src_ptr], 0x08 \n\t" - "daddiu %[dst_ptr], %[dst_ptr], 0x10 \n\t" - "daddi %[width], %[width], -0x08 \n\t" - "bnez %[width], 1b \n\t" - : [src] "=&f"(src), [dest] "=&f"(dest) - : [src_ptr] "r"(src_ptr), [dst_ptr] "r"(dst_ptr), [width] "r"(dst_width) - : "memory"); -} - -void ScaleAddRow_MMI(const uint8_t* src_ptr, uint16_t* dst_ptr, int src_width) { - uint64_t src, src_hi, src_lo, dest0, dest1; - const uint64_t mask = 0x0ULL; - - __asm__ volatile( - "1: \n\t" - "gsldlc1 %[src], 0x07(%[src_ptr]) \n\t" - "gsldrc1 %[src], 0x00(%[src_ptr]) \n\t" - "punpcklbh %[src_lo], %[src], %[mask] \n\t" - "punpckhbh %[src_hi], %[src], %[mask] \n\t" - - "gsldrc1 %[dest0], 0x00(%[dst_ptr]) \n\t" - "gsldlc1 %[dest0], 0x07(%[dst_ptr]) \n\t" - "paddush %[dest0], %[dest0], %[src_lo] \n\t" - "gsldrc1 %[dest1], 0x08(%[dst_ptr]) \n\t" - "gsldlc1 %[dest1], 0x0f(%[dst_ptr]) \n\t" - "paddush %[dest1], %[dest1], %[src_hi] \n\t" - - "gssdlc1 %[dest0], 0x07(%[dst_ptr]) \n\t" - "gssdrc1 %[dest0], 0x00(%[dst_ptr]) \n\t" - "gssdlc1 %[dest1], 0x0f(%[dst_ptr]) \n\t" - "gssdrc1 %[dest1], 0x08(%[dst_ptr]) \n\t" - - "daddiu %[src_ptr], %[src_ptr], 0x08 \n\t" - "daddiu %[dst_ptr], %[dst_ptr], 0x10 \n\t" - "daddi %[width], %[width], -0x08 \n\t" - "bnez %[width], 1b \n\t" - : [dest0] "=&f"(dest0), [dest1] "=&f"(dest1), [src_hi] "=&f"(src_hi), - [src_lo] "=&f"(src_lo), [src] "=&f"(src) - : [src_ptr] "r"(src_ptr), [dst_ptr] "r"(dst_ptr), [width] "r"(src_width), - [mask] "f"(mask) - : "memory"); -} - -void ScaleAddRow_16_MMI(const uint16_t* src_ptr, - uint32_t* dst_ptr, - int src_width) { - uint64_t src, src_hi, src_lo, 
dest0, dest1; - const uint64_t mask = 0x0ULL; - - __asm__ volatile( - "1: \n\t" - "gsldrc1 %[src], 0x00(%[src_ptr]) \n\t" - "gsldlc1 %[src], 0x07(%[src_ptr]) \n\t" - "punpcklhw %[src_lo], %[src], %[mask] \n\t" - "punpckhhw %[src_hi], %[src], %[mask] \n\t" - - "gsldrc1 %[dest0], 0x00(%[dst_ptr]) \n\t" - "gsldlc1 %[dest0], 0x07(%[dst_ptr]) \n\t" - "paddw %[dest0], %[dest0], %[src_lo] \n\t" - "gssdlc1 %[dest0], 0x07(%[dst_ptr]) \n\t" - "gssdrc1 %[dest0], 0x00(%[dst_ptr]) \n\t" - - "gsldrc1 %[dest1], 0x08(%[dst_ptr]) \n\t" - "gsldlc1 %[dest1], 0x0f(%[dst_ptr]) \n\t" - "paddw %[dest1], %[dest1], %[src_hi] \n\t" - "gssdlc1 %[dest1], 0x0f(%[dst_ptr]) \n\t" - "gssdrc1 %[dest1], 0x08(%[dst_ptr]) \n\t" - - "daddiu %[src_ptr], %[src_ptr], 0x08 \n\t" - "daddiu %[dst_ptr], %[dst_ptr], 0x10 \n\t" - "daddi %[width], %[width], -0x04 \n\t" - "bnez %[width], 1b \n\t" - : [dest0] "=&f"(dest0), [dest1] "=&f"(dest1), [src_hi] "=&f"(src_hi), - [src_lo] "=&f"(src_lo), [src] "=&f"(src) - : [src_ptr] "r"(src_ptr), [dst_ptr] "r"(dst_ptr), [width] "r"(src_width), - [mask] "f"(mask) - : "memory"); -} - -void ScaleARGBRowDownEven_MMI(const uint8_t* src_argb, - ptrdiff_t src_stride, - int src_stepx, - uint8_t* dst_argb, - int dst_width) { - (void)src_stride; - - uint64_t src0, src1, dest; - - __asm__ volatile( - "1: \n\t" - "lwc1 %[src0], 0x00(%[src_ptr]) \n\t" - "dadd %[src_ptr], %[src_ptr], %[src_stepx_4]\n\t" - "lwc1 %[src1], 0x00(%[src_ptr]) \n\t" - "punpcklwd %[dest], %[src0], %[src1] \n\t" - - "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t" - "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t" - - "dadd %[src_ptr], %[src_ptr], %[src_stepx_4]\n\t" - "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t" - "daddi %[width], %[width], -0x02 \n\t" - "bnez %[width], 1b \n\t" - : [src0] "=&f"(src0), [src1] "=&f"(src1), [dest] "=&f"(dest) - : [src_ptr] "r"(src_argb), [dst_ptr] "r"(dst_argb), - [src_stepx_4] "r"(src_stepx << 2), [width] "r"(dst_width) - : "memory"); -} - -void ScaleARGBRowDownEvenBox_MMI(const uint8_t* src_argb, - ptrdiff_t src_stride, - int src_stepx, - uint8_t* dst_argb, - int dst_width) { - const uint8_t* src0_ptr = src_argb; - const uint8_t* src1_ptr = src_argb + src_stride; - - uint64_t src0, src1, src_hi, src_lo; - uint64_t dest, dest_hi, dest_lo, dest0, dest1; - - const uint64_t mask = 0x0ULL; - const uint64_t ph = 0x0002000200020002ULL; - const uint64_t shift = 0x2ULL; - - __asm__ volatile( - "1: \n\t" - - "lwc1 %[src0], 0x00(%[src0_ptr]) \n\t" - "punpcklbh %[dest_lo], %[src0], %[mask] \n\t" - "lwc1 %[src0], 0x04(%[src0_ptr]) \n\t" - "punpcklbh %[dest_hi], %[src0], %[mask] \n\t" - - "lwc1 %[src1], 0x00(%[src1_ptr]) \n\t" - "punpcklbh %[src_lo], %[src1], %[mask] \n\t" - "lwc1 %[src1], 0x04(%[src1_ptr]) \n\t" - "punpcklbh %[src_hi], %[src1], %[mask] \n\t" - "paddh %[dest_lo], %[dest_lo], %[src_lo] \n\t" - "paddh %[dest_hi], %[dest_hi], %[src_hi] \n\t" - "paddh %[dest0], %[dest_hi], %[dest_lo] \n\t" - "paddh %[dest0], %[dest0], %[ph] \n\t" - "psrlh %[dest0], %[dest0], %[shift] \n\t" - - "dadd %[src0_ptr], %[src0_ptr], %[src_stepx_4] \n\t" - "dadd %[src1_ptr], %[src1_ptr], %[src_stepx_4] \n\t" - - "lwc1 %[src0], 0x00(%[src0_ptr]) \n\t" - "punpcklbh %[dest_lo], %[src0], %[mask] \n\t" - "lwc1 %[src0], 0x04(%[src0_ptr]) \n\t" - "punpcklbh %[dest_hi], %[src0], %[mask] \n\t" - - "lwc1 %[src1], 0x00(%[src1_ptr]) \n\t" - "punpcklbh %[src_lo], %[src1], %[mask] \n\t" - "lwc1 %[src1], 0x04(%[src1_ptr]) \n\t" - "punpcklbh %[src_hi], %[src1], %[mask] \n\t" - "paddh %[dest_lo], %[dest_lo], %[src_lo] \n\t" - "paddh %[dest_hi], %[dest_hi], 
%[src_hi] \n\t" - "paddh %[dest1], %[dest_hi], %[dest_lo] \n\t" - "paddh %[dest1], %[dest1], %[ph] \n\t" - "psrlh %[dest1], %[dest1], %[shift] \n\t" - - "packushb %[dest], %[dest0], %[dest1] \n\t" - "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t" - "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t" - - "dadd %[src0_ptr], %[src0_ptr], %[src_stepx_4] \n\t" - "dadd %[src1_ptr], %[src1_ptr], %[src_stepx_4] \n\t" - "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t" - "daddi %[width], %[width], -0x02 \n\t" - "bnez %[width], 1b \n\t" - : [src_hi] "=&f"(src_hi), [src_lo] "=&f"(src_lo), - [dest_hi] "=&f"(dest_hi), [dest_lo] "=&f"(dest_lo), - [dest0] "=&f"(dest0), [dest1] "=&f"(dest1), [src0] "=&f"(src0), - [src1] "=&f"(src1), [dest] "=&f"(dest) - : [src0_ptr] "r"(src0_ptr), [src1_ptr] "r"(src1_ptr), - [dst_ptr] "r"(dst_argb), [width] "r"(dst_width), - [src_stepx_4] "r"(src_stepx << 2), [shift] "f"(shift), [mask] "f"(mask), - [ph] "f"(ph) - : "memory"); -} - -// Scales a single row of pixels using point sampling. -void ScaleARGBCols_MMI(uint8_t* dst_argb, - const uint8_t* src_argb, - int dst_width, - int x, - int dx) { - const uint32_t* src = (const uint32_t*)(src_argb); - uint32_t* dst = (uint32_t*)(dst_argb); - - const uint32_t* src_tmp; - - uint64_t dest, offset; - - const uint64_t shift0 = 16; - const uint64_t shift1 = 2; - - __asm__ volatile( - "1: \n\t" - "srav %[offset], %[x], %[shift0] \n\t" - "sllv %[offset], %[offset], %[shift1] \n\t" - "dadd %[src_tmp], %[src_ptr], %[offset] \n\t" - "lwc1 %[dest], 0x00(%[src_tmp]) \n\t" - "swc1 %[dest], 0x00(%[dst_ptr]) \n\t" - - "dadd %[x], %[x], %[dx] \n\t" - - "daddiu %[dst_ptr], %[dst_ptr], 0x04 \n\t" - "daddi %[width], %[width], -0x01 \n\t" - "bnez %[width], 1b \n\t" - : [dest] "=&f"(dest), [offset] "=&r"(offset), [src_tmp] "=&r"(src_tmp) - : [src_ptr] "r"(src), [dst_ptr] "r"(dst), [width] "r"(dst_width), - [dx] "r"(dx), [x] "r"(x), [shift0] "r"(shift0), [shift1] "r"(shift1) - : "memory"); -} - -// Scales a single row of pixels up by 2x using point sampling. -void ScaleARGBColsUp2_MMI(uint8_t* dst_argb, - const uint8_t* src_argb, - int dst_width, - int x, - int dx) { - uint64_t src, dest0, dest1; - (void)x; - (void)dx; - - __asm__ volatile( - "1: \n\t" - "gsldrc1 %[src], 0x00(%[src_ptr]) \n\t" - "gsldlc1 %[src], 0x07(%[src_ptr]) \n\t" - "punpcklwd %[dest0], %[src], %[src] \n\t" - "gssdlc1 %[dest0], 0x07(%[dst_ptr]) \n\t" - "gssdrc1 %[dest0], 0x00(%[dst_ptr]) \n\t" - "punpckhwd %[dest1], %[src], %[src] \n\t" - "gssdlc1 %[dest1], 0x0f(%[dst_ptr]) \n\t" - "gssdrc1 %[dest1], 0x08(%[dst_ptr]) \n\t" - - "daddiu %[src_ptr], %[src_ptr], 0x08 \n\t" - "daddiu %[dst_ptr], %[dst_ptr], 0x10 \n\t" - "daddi %[width], %[width], -0x04 \n\t" - "bnez %[width], 1b \n\t" - : [dest0] "=&f"(dest0), [dest1] "=&f"(dest1), [src] "=&f"(src) - : [src_ptr] "r"(src_argb), [dst_ptr] "r"(dst_argb), [width] "r"(dst_width) - : "memory"); -} - -// Divide num by div and return as 16.16 fixed point result. -/* LibYUVBaseTest.TestFixedDiv */ -int FixedDiv_MIPS(int num, int div) { - int quotient = 0; - const int shift = 16; - - asm( - "dsll %[num], %[num], %[shift] \n\t" - "ddiv %[num], %[div] \t\n" - "mflo %[quo] \t\n" - : [quo] "+&r"(quotient) - : [num] "r"(num), [div] "r"(div), [shift] "r"(shift)); - - return quotient; -} - -// Divide num by div and return as 16.16 fixed point result. 
-/* LibYUVScaleTest.ARGBScaleTo320x240_Linear */ -int FixedDiv1_MIPS(int num, int div) { - int quotient = 0; - const int shift = 16; - const int val1 = 1; - const int64_t val11 = 0x00010001ULL; - - asm( - "dsll %[num], %[num], %[shift] \n\t" - "dsub %[num], %[num], %[val11] \n\t" - "dsub %[div], %[div], %[val1] \n\t" - "ddiv %[num], %[div] \t\n" - "mflo %[quo] \t\n" - : [quo] "+&r"(quotient) - : [num] "r"(num), [div] "r"(div), [val1] "r"(val1), [val11] "r"(val11), - [shift] "r"(shift)); - - return quotient; -} - -// Read 8x2 upsample with filtering and write 16x1. -// actually reads an extra pixel, so 9x2. -void ScaleRowUp2_16_MMI(const uint16_t* src_ptr, - ptrdiff_t src_stride, - uint16_t* dst, - int dst_width) { - const uint16_t* src2_ptr = src_ptr + src_stride; - - uint64_t src0, src1; - uint64_t dest, dest04, dest15, dest26, dest37; - uint64_t tmp0, tmp1, tmp2, tmp3; - - const uint64_t mask0 = 0x0003000900030009ULL; - const uint64_t mask1 = 0x0001000300010003ULL; - const uint64_t mask2 = 0x0009000300090003ULL; - const uint64_t mask3 = 0x0003000100030001ULL; - const uint64_t ph = 0x0000000800000008ULL; - const uint64_t shift = 4; - - __asm__ volatile( - "1: \n\t" - "gsldrc1 %[src0], 0x00(%[src1_ptr]) \n\t" - "gsldlc1 %[src0], 0x07(%[src1_ptr]) \n\t" - "pmaddhw %[dest04], %[src0], %[mask0] \n\t" - "gsldrc1 %[src1], 0x00(%[src2_ptr]) \n\t" - "gsldlc1 %[src1], 0x07(%[src2_ptr]) \n\t" - "pmaddhw %[dest], %[src1], %[mask1] \n\t" - "paddw %[dest04], %[dest04], %[dest] \n\t" - "paddw %[dest04], %[dest04], %[ph] \n\t" - "psrlw %[dest04], %[dest04], %[shift] \n\t" - - "pmaddhw %[dest15], %[src0], %[mask2] \n\t" - "pmaddhw %[dest], %[src1], %[mask3] \n\t" - "paddw %[dest15], %[dest15], %[dest] \n\t" - "paddw %[dest15], %[dest15], %[ph] \n\t" - "psrlw %[dest15], %[dest15], %[shift] \n\t" - - "gsldrc1 %[src0], 0x02(%[src1_ptr]) \n\t" - "gsldlc1 %[src0], 0x09(%[src1_ptr]) \n\t" - "pmaddhw %[dest26], %[src0], %[mask0] \n\t" - "gsldrc1 %[src1], 0x02(%[src2_ptr]) \n\t" - "gsldlc1 %[src1], 0x09(%[src2_ptr]) \n\t" - "pmaddhw %[dest], %[src1], %[mask1] \n\t" - "paddw %[dest26], %[dest26], %[dest] \n\t" - "paddw %[dest26], %[dest26], %[ph] \n\t" - "psrlw %[dest26], %[dest26], %[shift] \n\t" - - "pmaddhw %[dest37], %[src0], %[mask2] \n\t" - "pmaddhw %[dest], %[src1], %[mask3] \n\t" - "paddw %[dest37], %[dest37], %[dest] \n\t" - "paddw %[dest37], %[dest37], %[ph] \n\t" - "psrlw %[dest37], %[dest37], %[shift] \n\t" - - /* tmp0 = ( 00 04 02 06 ) */ - "packsswh %[tmp0], %[dest04], %[dest26] \n\t" - /* tmp1 = ( 01 05 03 07 ) */ - "packsswh %[tmp1], %[dest15], %[dest37] \n\t" - - /* tmp2 = ( 00 01 04 05 )*/ - "punpcklhw %[tmp2], %[tmp0], %[tmp1] \n\t" - /* tmp3 = ( 02 03 06 07 )*/ - "punpckhhw %[tmp3], %[tmp0], %[tmp1] \n\t" - - /* ( 00 01 02 03 ) */ - "punpcklwd %[dest], %[tmp2], %[tmp3] \n\t" - "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t" - "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t" - - /* ( 04 05 06 07 ) */ - "punpckhwd %[dest], %[tmp2], %[tmp3] \n\t" - "gssdlc1 %[dest], 0x0f(%[dst_ptr]) \n\t" - "gssdrc1 %[dest], 0x08(%[dst_ptr]) \n\t" - - "daddiu %[src1_ptr], %[src1_ptr], 0x08 \n\t" - "daddiu %[src2_ptr], %[src2_ptr], 0x08 \n\t" - "daddiu %[dst_ptr], %[dst_ptr], 0x10 \n\t" - "daddi %[width], %[width], -0x08 \n\t" - "bnez %[width], 1b \n\t" - : [src0] "=&f"(src0), [src1] "=&f"(src1), [dest04] "=&f"(dest04), - [dest15] "=&f"(dest15), [dest26] "=&f"(dest26), [dest37] "=&f"(dest37), - [tmp0] "=&f"(tmp0), [tmp1] "=&f"(tmp1), [tmp2] "=&f"(tmp2), - [tmp3] "=&f"(tmp3), [dest] "=&f"(dest) - : [src1_ptr] "r"(src_ptr), 
[src2_ptr] "r"(src2_ptr), [dst_ptr] "r"(dst), - [width] "r"(dst_width), [mask0] "f"(mask0), [mask1] "f"(mask1), - [mask2] "f"(mask2), [mask3] "f"(mask3), [shift] "f"(shift), [ph] "f"(ph) - : "memory"); -} - -void ScaleRowDown34_MMI(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst, - int dst_width) { - (void)src_stride; - assert((dst_width % 3 == 0) && (dst_width > 0)); - uint64_t src[2]; - uint64_t tmp[2]; - __asm__ volatile ( - "1: \n\t" - "gsldlc1 %[src0], 0x07(%[src_ptr]) \n\t" - "gsldrc1 %[src0], 0x00(%[src_ptr]) \n\t" - "gsldlc1 %[src1], 0x0f(%[src_ptr]) \n\t" - "gsldrc1 %[src1], 0x08(%[src_ptr]) \n\t" - "and %[tmp1], %[src0], %[mask1] \n\t" - "psrlw %[tmp0], %[src0], %[rmov] \n\t" - "psllw %[tmp0], %[tmp0], %[lmov1] \n\t" - "or %[src0], %[tmp0], %[tmp1] \n\t" - "punpckhwd %[tmp0], %[src0], %[src0] \n\t" - "psllw %[tmp1], %[tmp0], %[rmov] \n\t" - "or %[src0], %[src0], %[tmp1] \n\t" - "psrlw %[tmp0], %[tmp0], %[rmov8] \n\t" - "pextrh %[tmp0], %[tmp0], %[zero] \n\t" - "pinsrh_2 %[src0], %[src0], %[tmp0] \n\t" - "pextrh %[tmp0], %[src1], %[zero] \n\t" - "pinsrh_3 %[src0], %[src0], %[tmp0] \n\t" - - "punpckhwd %[tmp0], %[src1], %[src1] \n\t" - "pextrh %[tmp1], %[tmp0], %[zero] \n\t" - "psrlw %[src1], %[src1], %[rmov] \n\t" - "psllw %[tmp1], %[tmp1], %[rmov8] \n\t" - "or %[src1], %[src1], %[tmp1] \n\t" - "and %[tmp0], %[tmp0], %[mask2] \n\t" - "or %[src1], %[src1], %[tmp0] \n\t" - - "gssdlc1 %[src0], 0x07(%[dst_ptr]) \n\t" - "gssdrc1 %[src0], 0x00(%[dst_ptr]) \n\t" - "gsswlc1 %[src1], 0x0b(%[dst_ptr]) \n\t" - "gsswrc1 %[src1], 0x08(%[dst_ptr]) \n\t" - - "daddiu %[src_ptr], %[src_ptr], 0x10 \n\t" - "daddi %[width], %[width], -0x0c \n\t" - "daddiu %[dst_ptr], %[dst_ptr], 0x0c \n\t" - "bnez %[width], 1b \n\t" - - : [src0]"=&f"(src[0]), [src1]"=&f"(src[1]), - [tmp0]"=&f"(tmp[0]), [tmp1]"=&f"(tmp[1]) - : [src_ptr]"r"(src_ptr), [dst_ptr]"r"(dst), - [lmov]"f"(0xc), [rmov]"f"(0x18), - [mask1]"f"(0xffff0000ffff), [rmov8]"f"(0x8), - [zero]"f"(0x0), [mask2]"f"(0xff000000), - [width]"r"(dst_width), [lmov1]"f"(0x10) - : "memory" - ); -} -// clang-format on - -#endif // !defined(LIBYUV_DISABLE_MMI) && defined(_MIPS_ARCH_LOONGSON3A) - -#ifdef __cplusplus -} // extern "C" -} // namespace libyuv -#endif diff --git a/thirdparty/libyuv/source/scale_msa.cc b/thirdparty/libyuv/source/scale_msa.cc deleted file mode 100644 index 482a521..0000000 --- a/thirdparty/libyuv/source/scale_msa.cc +++ /dev/null @@ -1,949 +0,0 @@ -/* - * Copyright 2016 The LibYuv Project Authors. All rights reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. 
- */ - -#include <assert.h> - -#include "libyuv/scale_row.h" - -// This module is for GCC MSA -#if !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa) -#include "libyuv/macros_msa.h" - -#ifdef __cplusplus -namespace libyuv { -extern "C" { -#endif - -#define LOAD_INDEXED_DATA(srcp, indx0, out0) \ - { \ - out0[0] = srcp[indx0[0]]; \ - out0[1] = srcp[indx0[1]]; \ - out0[2] = srcp[indx0[2]]; \ - out0[3] = srcp[indx0[3]]; \ - } - -void ScaleARGBRowDown2_MSA(const uint8_t* src_argb, - ptrdiff_t src_stride, - uint8_t* dst_argb, - int dst_width) { - int x; - v16u8 src0, src1, dst0; - (void)src_stride; - - for (x = 0; x < dst_width; x += 4) { - src0 = (v16u8)__msa_ld_b((v16i8*)src_argb, 0); - src1 = (v16u8)__msa_ld_b((v16i8*)src_argb, 16); - dst0 = (v16u8)__msa_pckod_w((v4i32)src1, (v4i32)src0); - ST_UB(dst0, dst_argb); - src_argb += 32; - dst_argb += 16; - } -} - -void ScaleARGBRowDown2Linear_MSA(const uint8_t* src_argb, - ptrdiff_t src_stride, - uint8_t* dst_argb, - int dst_width) { - int x; - v16u8 src0, src1, vec0, vec1, dst0; - (void)src_stride; - - for (x = 0; x < dst_width; x += 4) { - src0 = (v16u8)__msa_ld_b((v16i8*)src_argb, 0); - src1 = (v16u8)__msa_ld_b((v16i8*)src_argb, 16); - vec0 = (v16u8)__msa_pckev_w((v4i32)src1, (v4i32)src0); - vec1 = (v16u8)__msa_pckod_w((v4i32)src1, (v4i32)src0); - dst0 = (v16u8)__msa_aver_u_b((v16u8)vec0, (v16u8)vec1); - ST_UB(dst0, dst_argb); - src_argb += 32; - dst_argb += 16; - } -} - -void ScaleARGBRowDown2Box_MSA(const uint8_t* src_argb, - ptrdiff_t src_stride, - uint8_t* dst_argb, - int dst_width) { - int x; - const uint8_t* s = src_argb; - const uint8_t* t = src_argb + src_stride; - v16u8 src0, src1, src2, src3, vec0, vec1, vec2, vec3, dst0; - v8u16 reg0, reg1, reg2, reg3; - v16i8 shuffler = {0, 4, 1, 5, 2, 6, 3, 7, 8, 12, 9, 13, 10, 14, 11, 15}; - - for (x = 0; x < dst_width; x += 4) { - src0 = (v16u8)__msa_ld_b((v16i8*)s, 0); - src1 = (v16u8)__msa_ld_b((v16i8*)s, 16); - src2 = (v16u8)__msa_ld_b((v16i8*)t, 0); - src3 = (v16u8)__msa_ld_b((v16i8*)t, 16); - vec0 = (v16u8)__msa_vshf_b(shuffler, (v16i8)src0, (v16i8)src0); - vec1 = (v16u8)__msa_vshf_b(shuffler, (v16i8)src1, (v16i8)src1); - vec2 = (v16u8)__msa_vshf_b(shuffler, (v16i8)src2, (v16i8)src2); - vec3 = (v16u8)__msa_vshf_b(shuffler, (v16i8)src3, (v16i8)src3); - reg0 = __msa_hadd_u_h(vec0, vec0); - reg1 = __msa_hadd_u_h(vec1, vec1); - reg2 = __msa_hadd_u_h(vec2, vec2); - reg3 = __msa_hadd_u_h(vec3, vec3); - reg0 += reg2; - reg1 += reg3; - reg0 = (v8u16)__msa_srari_h((v8i16)reg0, 2); - reg1 = (v8u16)__msa_srari_h((v8i16)reg1, 2); - dst0 = (v16u8)__msa_pckev_b((v16i8)reg1, (v16i8)reg0); - ST_UB(dst0, dst_argb); - s += 32; - t += 32; - dst_argb += 16; - } -} - -void ScaleARGBRowDownEven_MSA(const uint8_t* src_argb, - ptrdiff_t src_stride, - int32_t src_stepx, - uint8_t* dst_argb, - int dst_width) { - int x; - int32_t stepx = src_stepx * 4; - int32_t data0, data1, data2, data3; - (void)src_stride; - - for (x = 0; x < dst_width; x += 4) { - data0 = LW(src_argb); - data1 = LW(src_argb + stepx); - data2 = LW(src_argb + stepx * 2); - data3 = LW(src_argb + stepx * 3); - SW(data0, dst_argb); - SW(data1, dst_argb + 4); - SW(data2, dst_argb + 8); - SW(data3, dst_argb + 12); - src_argb += stepx * 4; - dst_argb += 16; - } -} - -void ScaleARGBRowDownEvenBox_MSA(const uint8_t* src_argb, - ptrdiff_t src_stride, - int src_stepx, - uint8_t* dst_argb, - int dst_width) { - int x; - const uint8_t* nxt_argb = src_argb + src_stride; - int32_t stepx = src_stepx * 4; - int64_t data0, data1, data2, data3; - v16u8 src0 = {0}, src1 = {0},
src2 = {0}, src3 = {0}; - v16u8 vec0, vec1, vec2, vec3; - v8u16 reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7; - v16u8 dst0; - - for (x = 0; x < dst_width; x += 4) { - data0 = LD(src_argb); - data1 = LD(src_argb + stepx); - data2 = LD(src_argb + stepx * 2); - data3 = LD(src_argb + stepx * 3); - src0 = (v16u8)__msa_insert_d((v2i64)src0, 0, data0); - src0 = (v16u8)__msa_insert_d((v2i64)src0, 1, data1); - src1 = (v16u8)__msa_insert_d((v2i64)src1, 0, data2); - src1 = (v16u8)__msa_insert_d((v2i64)src1, 1, data3); - data0 = LD(nxt_argb); - data1 = LD(nxt_argb + stepx); - data2 = LD(nxt_argb + stepx * 2); - data3 = LD(nxt_argb + stepx * 3); - src2 = (v16u8)__msa_insert_d((v2i64)src2, 0, data0); - src2 = (v16u8)__msa_insert_d((v2i64)src2, 1, data1); - src3 = (v16u8)__msa_insert_d((v2i64)src3, 0, data2); - src3 = (v16u8)__msa_insert_d((v2i64)src3, 1, data3); - vec0 = (v16u8)__msa_ilvr_b((v16i8)src2, (v16i8)src0); - vec1 = (v16u8)__msa_ilvr_b((v16i8)src3, (v16i8)src1); - vec2 = (v16u8)__msa_ilvl_b((v16i8)src2, (v16i8)src0); - vec3 = (v16u8)__msa_ilvl_b((v16i8)src3, (v16i8)src1); - reg0 = __msa_hadd_u_h(vec0, vec0); - reg1 = __msa_hadd_u_h(vec1, vec1); - reg2 = __msa_hadd_u_h(vec2, vec2); - reg3 = __msa_hadd_u_h(vec3, vec3); - reg4 = (v8u16)__msa_pckev_d((v2i64)reg2, (v2i64)reg0); - reg5 = (v8u16)__msa_pckev_d((v2i64)reg3, (v2i64)reg1); - reg6 = (v8u16)__msa_pckod_d((v2i64)reg2, (v2i64)reg0); - reg7 = (v8u16)__msa_pckod_d((v2i64)reg3, (v2i64)reg1); - reg4 += reg6; - reg5 += reg7; - reg4 = (v8u16)__msa_srari_h((v8i16)reg4, 2); - reg5 = (v8u16)__msa_srari_h((v8i16)reg5, 2); - dst0 = (v16u8)__msa_pckev_b((v16i8)reg5, (v16i8)reg4); - ST_UB(dst0, dst_argb); - src_argb += stepx * 4; - nxt_argb += stepx * 4; - dst_argb += 16; - } -} - -void ScaleRowDown2_MSA(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst, - int dst_width) { - int x; - v16u8 src0, src1, src2, src3, dst0, dst1; - (void)src_stride; - - for (x = 0; x < dst_width; x += 32) { - src0 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 0); - src1 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 16); - src2 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 32); - src3 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 48); - dst0 = (v16u8)__msa_pckod_b((v16i8)src1, (v16i8)src0); - dst1 = (v16u8)__msa_pckod_b((v16i8)src3, (v16i8)src2); - ST_UB2(dst0, dst1, dst, 16); - src_ptr += 64; - dst += 32; - } -} - -void ScaleRowDown2Linear_MSA(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst, - int dst_width) { - int x; - v16u8 src0, src1, src2, src3, vec0, vec1, vec2, vec3, dst0, dst1; - (void)src_stride; - - for (x = 0; x < dst_width; x += 32) { - src0 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 0); - src1 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 16); - src2 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 32); - src3 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 48); - vec0 = (v16u8)__msa_pckev_b((v16i8)src1, (v16i8)src0); - vec2 = (v16u8)__msa_pckev_b((v16i8)src3, (v16i8)src2); - vec1 = (v16u8)__msa_pckod_b((v16i8)src1, (v16i8)src0); - vec3 = (v16u8)__msa_pckod_b((v16i8)src3, (v16i8)src2); - dst0 = __msa_aver_u_b(vec1, vec0); - dst1 = __msa_aver_u_b(vec3, vec2); - ST_UB2(dst0, dst1, dst, 16); - src_ptr += 64; - dst += 32; - } -} - -void ScaleRowDown2Box_MSA(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst, - int dst_width) { - int x; - const uint8_t* s = src_ptr; - const uint8_t* t = src_ptr + src_stride; - v16u8 src0, src1, src2, src3, src4, src5, src6, src7, dst0, dst1; - v8u16 vec0, vec1, vec2, vec3; - - for (x = 0; x < dst_width; x += 32) { - src0 = (v16u8)__msa_ld_b((v16i8*)s, 0); - src1 = 
(v16u8)__msa_ld_b((v16i8*)s, 16); - src2 = (v16u8)__msa_ld_b((v16i8*)s, 32); - src3 = (v16u8)__msa_ld_b((v16i8*)s, 48); - src4 = (v16u8)__msa_ld_b((v16i8*)t, 0); - src5 = (v16u8)__msa_ld_b((v16i8*)t, 16); - src6 = (v16u8)__msa_ld_b((v16i8*)t, 32); - src7 = (v16u8)__msa_ld_b((v16i8*)t, 48); - vec0 = __msa_hadd_u_h(src0, src0); - vec1 = __msa_hadd_u_h(src1, src1); - vec2 = __msa_hadd_u_h(src2, src2); - vec3 = __msa_hadd_u_h(src3, src3); - vec0 += __msa_hadd_u_h(src4, src4); - vec1 += __msa_hadd_u_h(src5, src5); - vec2 += __msa_hadd_u_h(src6, src6); - vec3 += __msa_hadd_u_h(src7, src7); - vec0 = (v8u16)__msa_srari_h((v8i16)vec0, 2); - vec1 = (v8u16)__msa_srari_h((v8i16)vec1, 2); - vec2 = (v8u16)__msa_srari_h((v8i16)vec2, 2); - vec3 = (v8u16)__msa_srari_h((v8i16)vec3, 2); - dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0); - dst1 = (v16u8)__msa_pckev_b((v16i8)vec3, (v16i8)vec2); - ST_UB2(dst0, dst1, dst, 16); - s += 64; - t += 64; - dst += 32; - } -} - -void ScaleRowDown4_MSA(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst, - int dst_width) { - int x; - v16u8 src0, src1, src2, src3, vec0, vec1, dst0; - (void)src_stride; - - for (x = 0; x < dst_width; x += 16) { - src0 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 0); - src1 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 16); - src2 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 32); - src3 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 48); - vec0 = (v16u8)__msa_pckev_b((v16i8)src1, (v16i8)src0); - vec1 = (v16u8)__msa_pckev_b((v16i8)src3, (v16i8)src2); - dst0 = (v16u8)__msa_pckod_b((v16i8)vec1, (v16i8)vec0); - ST_UB(dst0, dst); - src_ptr += 64; - dst += 16; - } -} - -void ScaleRowDown4Box_MSA(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst, - int dst_width) { - int x; - const uint8_t* s = src_ptr; - const uint8_t* t0 = s + src_stride; - const uint8_t* t1 = s + src_stride * 2; - const uint8_t* t2 = s + src_stride * 3; - v16u8 src0, src1, src2, src3, src4, src5, src6, src7, dst0; - v8u16 vec0, vec1, vec2, vec3; - v4u32 reg0, reg1, reg2, reg3; - - for (x = 0; x < dst_width; x += 16) { - src0 = (v16u8)__msa_ld_b((v16i8*)s, 0); - src1 = (v16u8)__msa_ld_b((v16i8*)s, 16); - src2 = (v16u8)__msa_ld_b((v16i8*)s, 32); - src3 = (v16u8)__msa_ld_b((v16i8*)s, 48); - src4 = (v16u8)__msa_ld_b((v16i8*)t0, 0); - src5 = (v16u8)__msa_ld_b((v16i8*)t0, 16); - src6 = (v16u8)__msa_ld_b((v16i8*)t0, 32); - src7 = (v16u8)__msa_ld_b((v16i8*)t0, 48); - vec0 = __msa_hadd_u_h(src0, src0); - vec1 = __msa_hadd_u_h(src1, src1); - vec2 = __msa_hadd_u_h(src2, src2); - vec3 = __msa_hadd_u_h(src3, src3); - vec0 += __msa_hadd_u_h(src4, src4); - vec1 += __msa_hadd_u_h(src5, src5); - vec2 += __msa_hadd_u_h(src6, src6); - vec3 += __msa_hadd_u_h(src7, src7); - src0 = (v16u8)__msa_ld_b((v16i8*)t1, 0); - src1 = (v16u8)__msa_ld_b((v16i8*)t1, 16); - src2 = (v16u8)__msa_ld_b((v16i8*)t1, 32); - src3 = (v16u8)__msa_ld_b((v16i8*)t1, 48); - src4 = (v16u8)__msa_ld_b((v16i8*)t2, 0); - src5 = (v16u8)__msa_ld_b((v16i8*)t2, 16); - src6 = (v16u8)__msa_ld_b((v16i8*)t2, 32); - src7 = (v16u8)__msa_ld_b((v16i8*)t2, 48); - vec0 += __msa_hadd_u_h(src0, src0); - vec1 += __msa_hadd_u_h(src1, src1); - vec2 += __msa_hadd_u_h(src2, src2); - vec3 += __msa_hadd_u_h(src3, src3); - vec0 += __msa_hadd_u_h(src4, src4); - vec1 += __msa_hadd_u_h(src5, src5); - vec2 += __msa_hadd_u_h(src6, src6); - vec3 += __msa_hadd_u_h(src7, src7); - reg0 = __msa_hadd_u_w(vec0, vec0); - reg1 = __msa_hadd_u_w(vec1, vec1); - reg2 = __msa_hadd_u_w(vec2, vec2); - reg3 = __msa_hadd_u_w(vec3, vec3); - reg0 = (v4u32)__msa_srari_w((v4i32)reg0, 4); 
- reg1 = (v4u32)__msa_srari_w((v4i32)reg1, 4); - reg2 = (v4u32)__msa_srari_w((v4i32)reg2, 4); - reg3 = (v4u32)__msa_srari_w((v4i32)reg3, 4); - vec0 = (v8u16)__msa_pckev_h((v8i16)reg1, (v8i16)reg0); - vec1 = (v8u16)__msa_pckev_h((v8i16)reg3, (v8i16)reg2); - dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0); - ST_UB(dst0, dst); - s += 64; - t0 += 64; - t1 += 64; - t2 += 64; - dst += 16; - } -} - -void ScaleRowDown38_MSA(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst, - int dst_width) { - int x, width; - uint64_t dst0; - uint32_t dst1; - v16u8 src0, src1, vec0; - v16i8 mask = {0, 3, 6, 8, 11, 14, 16, 19, 22, 24, 27, 30, 0, 0, 0, 0}; - (void)src_stride; - - assert(dst_width % 3 == 0); - width = dst_width / 3; - - for (x = 0; x < width; x += 4) { - src0 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 0); - src1 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 16); - vec0 = (v16u8)__msa_vshf_b(mask, (v16i8)src1, (v16i8)src0); - dst0 = __msa_copy_u_d((v2i64)vec0, 0); - dst1 = __msa_copy_u_w((v4i32)vec0, 2); - SD(dst0, dst); - SW(dst1, dst + 8); - src_ptr += 32; - dst += 12; - } -} - -void ScaleRowDown38_2_Box_MSA(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width) { - int x, width; - const uint8_t* s = src_ptr; - const uint8_t* t = src_ptr + src_stride; - uint64_t dst0; - uint32_t dst1; - v16u8 src0, src1, src2, src3, out; - v8u16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; - v4u32 tmp0, tmp1, tmp2, tmp3, tmp4; - v8i16 zero = {0}; - v8i16 mask = {0, 1, 2, 8, 3, 4, 5, 9}; - v16i8 dst_mask = {0, 2, 16, 4, 6, 18, 8, 10, 20, 12, 14, 22, 0, 0, 0, 0}; - v4u32 const_0x2AAA = (v4u32)__msa_fill_w(0x2AAA); - v4u32 const_0x4000 = (v4u32)__msa_fill_w(0x4000); - - assert((dst_width % 3 == 0) && (dst_width > 0)); - width = dst_width / 3; - - for (x = 0; x < width; x += 4) { - src0 = (v16u8)__msa_ld_b((v16i8*)s, 0); - src1 = (v16u8)__msa_ld_b((v16i8*)s, 16); - src2 = (v16u8)__msa_ld_b((v16i8*)t, 0); - src3 = (v16u8)__msa_ld_b((v16i8*)t, 16); - vec0 = (v8u16)__msa_ilvr_b((v16i8)src2, (v16i8)src0); - vec1 = (v8u16)__msa_ilvl_b((v16i8)src2, (v16i8)src0); - vec2 = (v8u16)__msa_ilvr_b((v16i8)src3, (v16i8)src1); - vec3 = (v8u16)__msa_ilvl_b((v16i8)src3, (v16i8)src1); - vec0 = __msa_hadd_u_h((v16u8)vec0, (v16u8)vec0); - vec1 = __msa_hadd_u_h((v16u8)vec1, (v16u8)vec1); - vec2 = __msa_hadd_u_h((v16u8)vec2, (v16u8)vec2); - vec3 = __msa_hadd_u_h((v16u8)vec3, (v16u8)vec3); - vec4 = (v8u16)__msa_vshf_h(mask, zero, (v8i16)vec0); - vec5 = (v8u16)__msa_vshf_h(mask, zero, (v8i16)vec1); - vec6 = (v8u16)__msa_vshf_h(mask, zero, (v8i16)vec2); - vec7 = (v8u16)__msa_vshf_h(mask, zero, (v8i16)vec3); - vec0 = (v8u16)__msa_pckod_w((v4i32)vec1, (v4i32)vec0); - vec1 = (v8u16)__msa_pckod_w((v4i32)vec3, (v4i32)vec2); - vec0 = (v8u16)__msa_pckod_w((v4i32)vec1, (v4i32)vec0); - tmp0 = __msa_hadd_u_w(vec4, vec4); - tmp1 = __msa_hadd_u_w(vec5, vec5); - tmp2 = __msa_hadd_u_w(vec6, vec6); - tmp3 = __msa_hadd_u_w(vec7, vec7); - tmp4 = __msa_hadd_u_w(vec0, vec0); - vec0 = (v8u16)__msa_pckev_h((v8i16)tmp1, (v8i16)tmp0); - vec1 = (v8u16)__msa_pckev_h((v8i16)tmp3, (v8i16)tmp2); - tmp0 = __msa_hadd_u_w(vec0, vec0); - tmp1 = __msa_hadd_u_w(vec1, vec1); - tmp0 *= const_0x2AAA; - tmp1 *= const_0x2AAA; - tmp4 *= const_0x4000; - tmp0 = (v4u32)__msa_srai_w((v4i32)tmp0, 16); - tmp1 = (v4u32)__msa_srai_w((v4i32)tmp1, 16); - tmp4 = (v4u32)__msa_srai_w((v4i32)tmp4, 16); - vec0 = (v8u16)__msa_pckev_h((v8i16)tmp1, (v8i16)tmp0); - vec1 = (v8u16)__msa_pckev_h((v8i16)tmp4, (v8i16)tmp4); - out = (v16u8)__msa_vshf_b(dst_mask, 
(v16i8)vec1, (v16i8)vec0); - dst0 = __msa_copy_u_d((v2i64)out, 0); - dst1 = __msa_copy_u_w((v4i32)out, 2); - SD(dst0, dst_ptr); - SW(dst1, dst_ptr + 8); - s += 32; - t += 32; - dst_ptr += 12; - } -} - -void ScaleRowDown38_3_Box_MSA(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width) { - int x, width; - const uint8_t* s = src_ptr; - const uint8_t* t0 = s + src_stride; - const uint8_t* t1 = s + src_stride * 2; - uint64_t dst0; - uint32_t dst1; - v16u8 src0, src1, src2, src3, src4, src5, out; - v8u16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; - v4u32 tmp0, tmp1, tmp2, tmp3, tmp4; - v8u16 zero = {0}; - v8i16 mask = {0, 1, 2, 8, 3, 4, 5, 9}; - v16i8 dst_mask = {0, 2, 16, 4, 6, 18, 8, 10, 20, 12, 14, 22, 0, 0, 0, 0}; - v4u32 const_0x1C71 = (v4u32)__msa_fill_w(0x1C71); - v4u32 const_0x2AAA = (v4u32)__msa_fill_w(0x2AAA); - - assert((dst_width % 3 == 0) && (dst_width > 0)); - width = dst_width / 3; - - for (x = 0; x < width; x += 4) { - src0 = (v16u8)__msa_ld_b((v16i8*)s, 0); - src1 = (v16u8)__msa_ld_b((v16i8*)s, 16); - src2 = (v16u8)__msa_ld_b((v16i8*)t0, 0); - src3 = (v16u8)__msa_ld_b((v16i8*)t0, 16); - src4 = (v16u8)__msa_ld_b((v16i8*)t1, 0); - src5 = (v16u8)__msa_ld_b((v16i8*)t1, 16); - vec0 = (v8u16)__msa_ilvr_b((v16i8)src2, (v16i8)src0); - vec1 = (v8u16)__msa_ilvl_b((v16i8)src2, (v16i8)src0); - vec2 = (v8u16)__msa_ilvr_b((v16i8)src3, (v16i8)src1); - vec3 = (v8u16)__msa_ilvl_b((v16i8)src3, (v16i8)src1); - vec4 = (v8u16)__msa_ilvr_b((v16i8)zero, (v16i8)src4); - vec5 = (v8u16)__msa_ilvl_b((v16i8)zero, (v16i8)src4); - vec6 = (v8u16)__msa_ilvr_b((v16i8)zero, (v16i8)src5); - vec7 = (v8u16)__msa_ilvl_b((v16i8)zero, (v16i8)src5); - vec0 = __msa_hadd_u_h((v16u8)vec0, (v16u8)vec0); - vec1 = __msa_hadd_u_h((v16u8)vec1, (v16u8)vec1); - vec2 = __msa_hadd_u_h((v16u8)vec2, (v16u8)vec2); - vec3 = __msa_hadd_u_h((v16u8)vec3, (v16u8)vec3); - vec0 += __msa_hadd_u_h((v16u8)vec4, (v16u8)vec4); - vec1 += __msa_hadd_u_h((v16u8)vec5, (v16u8)vec5); - vec2 += __msa_hadd_u_h((v16u8)vec6, (v16u8)vec6); - vec3 += __msa_hadd_u_h((v16u8)vec7, (v16u8)vec7); - vec4 = (v8u16)__msa_vshf_h(mask, (v8i16)zero, (v8i16)vec0); - vec5 = (v8u16)__msa_vshf_h(mask, (v8i16)zero, (v8i16)vec1); - vec6 = (v8u16)__msa_vshf_h(mask, (v8i16)zero, (v8i16)vec2); - vec7 = (v8u16)__msa_vshf_h(mask, (v8i16)zero, (v8i16)vec3); - vec0 = (v8u16)__msa_pckod_w((v4i32)vec1, (v4i32)vec0); - vec1 = (v8u16)__msa_pckod_w((v4i32)vec3, (v4i32)vec2); - vec0 = (v8u16)__msa_pckod_w((v4i32)vec1, (v4i32)vec0); - tmp0 = __msa_hadd_u_w(vec4, vec4); - tmp1 = __msa_hadd_u_w(vec5, vec5); - tmp2 = __msa_hadd_u_w(vec6, vec6); - tmp3 = __msa_hadd_u_w(vec7, vec7); - tmp4 = __msa_hadd_u_w(vec0, vec0); - vec0 = (v8u16)__msa_pckev_h((v8i16)tmp1, (v8i16)tmp0); - vec1 = (v8u16)__msa_pckev_h((v8i16)tmp3, (v8i16)tmp2); - tmp0 = __msa_hadd_u_w(vec0, vec0); - tmp1 = __msa_hadd_u_w(vec1, vec1); - tmp0 *= const_0x1C71; - tmp1 *= const_0x1C71; - tmp4 *= const_0x2AAA; - tmp0 = (v4u32)__msa_srai_w((v4i32)tmp0, 16); - tmp1 = (v4u32)__msa_srai_w((v4i32)tmp1, 16); - tmp4 = (v4u32)__msa_srai_w((v4i32)tmp4, 16); - vec0 = (v8u16)__msa_pckev_h((v8i16)tmp1, (v8i16)tmp0); - vec1 = (v8u16)__msa_pckev_h((v8i16)tmp4, (v8i16)tmp4); - out = (v16u8)__msa_vshf_b(dst_mask, (v16i8)vec1, (v16i8)vec0); - dst0 = __msa_copy_u_d((v2i64)out, 0); - dst1 = __msa_copy_u_w((v4i32)out, 2); - SD(dst0, dst_ptr); - SW(dst1, dst_ptr + 8); - s += 32; - t0 += 32; - t1 += 32; - dst_ptr += 12; - } -} - -void ScaleAddRow_MSA(const uint8_t* src_ptr, uint16_t* dst_ptr, int src_width) { - 
int x; - v16u8 src0; - v8u16 dst0, dst1; - v16i8 zero = {0}; - - assert(src_width > 0); - - for (x = 0; x < src_width; x += 16) { - src0 = LD_UB(src_ptr); - dst0 = (v8u16)__msa_ld_h((v8i16*)dst_ptr, 0); - dst1 = (v8u16)__msa_ld_h((v8i16*)dst_ptr, 16); - dst0 += (v8u16)__msa_ilvr_b(zero, (v16i8)src0); - dst1 += (v8u16)__msa_ilvl_b(zero, (v16i8)src0); - ST_UH2(dst0, dst1, dst_ptr, 8); - src_ptr += 16; - dst_ptr += 16; - } -} - -void ScaleFilterCols_MSA(uint8_t* dst_ptr, - const uint8_t* src_ptr, - int dst_width, - int x, - int dx) { - int j; - v4i32 vec_x = __msa_fill_w(x); - v4i32 vec_dx = __msa_fill_w(dx); - v4i32 vec_const = {0, 1, 2, 3}; - v4i32 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9; - v4i32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; - v8u16 reg0, reg1; - v16u8 dst0; - v4i32 const_0xFFFF = __msa_fill_w(0xFFFF); - v4i32 const_0x40 = __msa_fill_w(0x40); - - vec0 = vec_dx * vec_const; - vec1 = vec_dx * 4; - vec_x += vec0; - - for (j = 0; j < dst_width - 1; j += 16) { - vec2 = vec_x >> 16; - vec6 = vec_x & const_0xFFFF; - vec_x += vec1; - vec3 = vec_x >> 16; - vec7 = vec_x & const_0xFFFF; - vec_x += vec1; - vec4 = vec_x >> 16; - vec8 = vec_x & const_0xFFFF; - vec_x += vec1; - vec5 = vec_x >> 16; - vec9 = vec_x & const_0xFFFF; - vec_x += vec1; - vec6 >>= 9; - vec7 >>= 9; - vec8 >>= 9; - vec9 >>= 9; - LOAD_INDEXED_DATA(src_ptr, vec2, tmp0); - LOAD_INDEXED_DATA(src_ptr, vec3, tmp1); - LOAD_INDEXED_DATA(src_ptr, vec4, tmp2); - LOAD_INDEXED_DATA(src_ptr, vec5, tmp3); - vec2 += 1; - vec3 += 1; - vec4 += 1; - vec5 += 1; - LOAD_INDEXED_DATA(src_ptr, vec2, tmp4); - LOAD_INDEXED_DATA(src_ptr, vec3, tmp5); - LOAD_INDEXED_DATA(src_ptr, vec4, tmp6); - LOAD_INDEXED_DATA(src_ptr, vec5, tmp7); - tmp4 -= tmp0; - tmp5 -= tmp1; - tmp6 -= tmp2; - tmp7 -= tmp3; - tmp4 *= vec6; - tmp5 *= vec7; - tmp6 *= vec8; - tmp7 *= vec9; - tmp4 += const_0x40; - tmp5 += const_0x40; - tmp6 += const_0x40; - tmp7 += const_0x40; - tmp4 >>= 7; - tmp5 >>= 7; - tmp6 >>= 7; - tmp7 >>= 7; - tmp0 += tmp4; - tmp1 += tmp5; - tmp2 += tmp6; - tmp3 += tmp7; - reg0 = (v8u16)__msa_pckev_h((v8i16)tmp1, (v8i16)tmp0); - reg1 = (v8u16)__msa_pckev_h((v8i16)tmp3, (v8i16)tmp2); - dst0 = (v16u8)__msa_pckev_b((v16i8)reg1, (v16i8)reg0); - __msa_st_b(dst0, dst_ptr, 0); - dst_ptr += 16; - } -} - -void ScaleARGBCols_MSA(uint8_t* dst_argb, - const uint8_t* src_argb, - int dst_width, - int x, - int dx) { - const uint32_t* src = (const uint32_t*)(src_argb); - uint32_t* dst = (uint32_t*)(dst_argb); - int j; - v4i32 x_vec = __msa_fill_w(x); - v4i32 dx_vec = __msa_fill_w(dx); - v4i32 const_vec = {0, 1, 2, 3}; - v4i32 vec0, vec1, vec2; - v4i32 dst0; - - vec0 = dx_vec * const_vec; - vec1 = dx_vec * 4; - x_vec += vec0; - - for (j = 0; j < dst_width; j += 4) { - vec2 = x_vec >> 16; - x_vec += vec1; - LOAD_INDEXED_DATA(src, vec2, dst0); - __msa_st_w(dst0, dst, 0); - dst += 4; - } -} - -void ScaleARGBFilterCols_MSA(uint8_t* dst_argb, - const uint8_t* src_argb, - int dst_width, - int x, - int dx) { - const uint32_t* src = (const uint32_t*)(src_argb); - int j; - v4u32 src0, src1, src2, src3; - v4u32 vec0, vec1, vec2, vec3; - v16u8 reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7; - v16u8 mult0, mult1, mult2, mult3; - v8u16 tmp0, tmp1, tmp2, tmp3; - v16u8 dst0, dst1; - v4u32 vec_x = (v4u32)__msa_fill_w(x); - v4u32 vec_dx = (v4u32)__msa_fill_w(dx); - v4u32 vec_const = {0, 1, 2, 3}; - v16u8 const_0x7f = (v16u8)__msa_fill_b(0x7f); - - vec0 = vec_dx * vec_const; - vec1 = vec_dx * 4; - vec_x += vec0; - - for (j = 0; j < dst_width - 1; j += 8) { - vec2 
= vec_x >> 16; - reg0 = (v16u8)(vec_x >> 9); - vec_x += vec1; - vec3 = vec_x >> 16; - reg1 = (v16u8)(vec_x >> 9); - vec_x += vec1; - reg0 = reg0 & const_0x7f; - reg1 = reg1 & const_0x7f; - reg0 = (v16u8)__msa_shf_b((v16i8)reg0, 0); - reg1 = (v16u8)__msa_shf_b((v16i8)reg1, 0); - reg2 = reg0 ^ const_0x7f; - reg3 = reg1 ^ const_0x7f; - mult0 = (v16u8)__msa_ilvr_b((v16i8)reg0, (v16i8)reg2); - mult1 = (v16u8)__msa_ilvl_b((v16i8)reg0, (v16i8)reg2); - mult2 = (v16u8)__msa_ilvr_b((v16i8)reg1, (v16i8)reg3); - mult3 = (v16u8)__msa_ilvl_b((v16i8)reg1, (v16i8)reg3); - LOAD_INDEXED_DATA(src, vec2, src0); - LOAD_INDEXED_DATA(src, vec3, src1); - vec2 += 1; - vec3 += 1; - LOAD_INDEXED_DATA(src, vec2, src2); - LOAD_INDEXED_DATA(src, vec3, src3); - reg4 = (v16u8)__msa_ilvr_b((v16i8)src2, (v16i8)src0); - reg5 = (v16u8)__msa_ilvl_b((v16i8)src2, (v16i8)src0); - reg6 = (v16u8)__msa_ilvr_b((v16i8)src3, (v16i8)src1); - reg7 = (v16u8)__msa_ilvl_b((v16i8)src3, (v16i8)src1); - tmp0 = __msa_dotp_u_h(reg4, mult0); - tmp1 = __msa_dotp_u_h(reg5, mult1); - tmp2 = __msa_dotp_u_h(reg6, mult2); - tmp3 = __msa_dotp_u_h(reg7, mult3); - tmp0 >>= 7; - tmp1 >>= 7; - tmp2 >>= 7; - tmp3 >>= 7; - dst0 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0); - dst1 = (v16u8)__msa_pckev_b((v16i8)tmp3, (v16i8)tmp2); - __msa_st_b(dst0, dst_argb, 0); - __msa_st_b(dst1, dst_argb, 16); - dst_argb += 32; - } -} - -void ScaleRowDown34_MSA(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst, - int dst_width) { - int x; - (void)src_stride; - v16u8 src0, src1, src2, src3; - v16u8 vec0, vec1, vec2; - v16i8 mask0 = {0, 1, 3, 4, 5, 7, 8, 9, 11, 12, 13, 15, 16, 17, 19, 20}; - v16i8 mask1 = {5, 7, 8, 9, 11, 12, 13, 15, 16, 17, 19, 20, 21, 23, 24, 25}; - v16i8 mask2 = {11, 12, 13, 15, 16, 17, 19, 20, - 21, 23, 24, 25, 27, 28, 29, 31}; - - assert((dst_width % 3 == 0) && (dst_width > 0)); - - for (x = 0; x < dst_width; x += 48) { - src0 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 0); - src1 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 16); - src2 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 32); - src3 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 48); - vec0 = (v16u8)__msa_vshf_b(mask0, (v16i8)src1, (v16i8)src0); - vec1 = (v16u8)__msa_vshf_b(mask1, (v16i8)src2, (v16i8)src1); - vec2 = (v16u8)__msa_vshf_b(mask2, (v16i8)src3, (v16i8)src2); - __msa_st_b((v16i8)vec0, dst, 0); - __msa_st_b((v16i8)vec1, dst, 16); - __msa_st_b((v16i8)vec2, dst, 32); - src_ptr += 64; - dst += 48; - } -} - -void ScaleRowDown34_0_Box_MSA(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* d, - int dst_width) { - const uint8_t* s = src_ptr; - const uint8_t* t = src_ptr + src_stride; - int x; - v16u8 src0, src1, src2, src3, src4, src5, src6, src7, dst0, dst1, dst2; - v16u8 vec0, vec1, vec2, vec3, vec4, vec5; - v16u8 vec6, vec7, vec8, vec9, vec10, vec11; - v8i16 reg0, reg1, reg2, reg3, reg4, reg5; - v8i16 reg6, reg7, reg8, reg9, reg10, reg11; - v16u8 const0 = {3, 1, 1, 1, 1, 3, 3, 1, 1, 1, 1, 3, 3, 1, 1, 1}; - v16u8 const1 = {1, 3, 3, 1, 1, 1, 1, 3, 3, 1, 1, 1, 1, 3, 3, 1}; - v16u8 const2 = {1, 1, 1, 3, 3, 1, 1, 1, 1, 3, 3, 1, 1, 1, 1, 3}; - v16i8 mask0 = {0, 1, 1, 2, 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10}; - v16i8 mask1 = {10, 11, 12, 13, 13, 14, 14, 15, - 16, 17, 17, 18, 18, 19, 20, 21}; - v16i8 mask2 = {5, 6, 6, 7, 8, 9, 9, 10, 10, 11, 12, 13, 13, 14, 14, 15}; - v8i16 shft0 = {2, 1, 2, 2, 1, 2, 2, 1}; - v8i16 shft1 = {2, 2, 1, 2, 2, 1, 2, 2}; - v8i16 shft2 = {1, 2, 2, 1, 2, 2, 1, 2}; - - assert((dst_width % 3 == 0) && (dst_width > 0)); - - for (x = 0; x < dst_width; x += 48) { - src0 = 
(v16u8)__msa_ld_b((v16i8*)s, 0); - src1 = (v16u8)__msa_ld_b((v16i8*)s, 16); - src2 = (v16u8)__msa_ld_b((v16i8*)s, 32); - src3 = (v16u8)__msa_ld_b((v16i8*)s, 48); - src4 = (v16u8)__msa_ld_b((v16i8*)t, 0); - src5 = (v16u8)__msa_ld_b((v16i8*)t, 16); - src6 = (v16u8)__msa_ld_b((v16i8*)t, 32); - src7 = (v16u8)__msa_ld_b((v16i8*)t, 48); - vec0 = (v16u8)__msa_vshf_b(mask0, (v16i8)src0, (v16i8)src0); - vec1 = (v16u8)__msa_vshf_b(mask1, (v16i8)src1, (v16i8)src0); - vec2 = (v16u8)__msa_vshf_b(mask2, (v16i8)src1, (v16i8)src1); - vec3 = (v16u8)__msa_vshf_b(mask0, (v16i8)src2, (v16i8)src2); - vec4 = (v16u8)__msa_vshf_b(mask1, (v16i8)src3, (v16i8)src2); - vec5 = (v16u8)__msa_vshf_b(mask2, (v16i8)src3, (v16i8)src3); - vec6 = (v16u8)__msa_vshf_b(mask0, (v16i8)src4, (v16i8)src4); - vec7 = (v16u8)__msa_vshf_b(mask1, (v16i8)src5, (v16i8)src4); - vec8 = (v16u8)__msa_vshf_b(mask2, (v16i8)src5, (v16i8)src5); - vec9 = (v16u8)__msa_vshf_b(mask0, (v16i8)src6, (v16i8)src6); - vec10 = (v16u8)__msa_vshf_b(mask1, (v16i8)src7, (v16i8)src6); - vec11 = (v16u8)__msa_vshf_b(mask2, (v16i8)src7, (v16i8)src7); - reg0 = (v8i16)__msa_dotp_u_h(vec0, const0); - reg1 = (v8i16)__msa_dotp_u_h(vec1, const1); - reg2 = (v8i16)__msa_dotp_u_h(vec2, const2); - reg3 = (v8i16)__msa_dotp_u_h(vec3, const0); - reg4 = (v8i16)__msa_dotp_u_h(vec4, const1); - reg5 = (v8i16)__msa_dotp_u_h(vec5, const2); - reg6 = (v8i16)__msa_dotp_u_h(vec6, const0); - reg7 = (v8i16)__msa_dotp_u_h(vec7, const1); - reg8 = (v8i16)__msa_dotp_u_h(vec8, const2); - reg9 = (v8i16)__msa_dotp_u_h(vec9, const0); - reg10 = (v8i16)__msa_dotp_u_h(vec10, const1); - reg11 = (v8i16)__msa_dotp_u_h(vec11, const2); - reg0 = __msa_srar_h(reg0, shft0); - reg1 = __msa_srar_h(reg1, shft1); - reg2 = __msa_srar_h(reg2, shft2); - reg3 = __msa_srar_h(reg3, shft0); - reg4 = __msa_srar_h(reg4, shft1); - reg5 = __msa_srar_h(reg5, shft2); - reg6 = __msa_srar_h(reg6, shft0); - reg7 = __msa_srar_h(reg7, shft1); - reg8 = __msa_srar_h(reg8, shft2); - reg9 = __msa_srar_h(reg9, shft0); - reg10 = __msa_srar_h(reg10, shft1); - reg11 = __msa_srar_h(reg11, shft2); - reg0 = reg0 * 3 + reg6; - reg1 = reg1 * 3 + reg7; - reg2 = reg2 * 3 + reg8; - reg3 = reg3 * 3 + reg9; - reg4 = reg4 * 3 + reg10; - reg5 = reg5 * 3 + reg11; - reg0 = __msa_srari_h(reg0, 2); - reg1 = __msa_srari_h(reg1, 2); - reg2 = __msa_srari_h(reg2, 2); - reg3 = __msa_srari_h(reg3, 2); - reg4 = __msa_srari_h(reg4, 2); - reg5 = __msa_srari_h(reg5, 2); - dst0 = (v16u8)__msa_pckev_b((v16i8)reg1, (v16i8)reg0); - dst1 = (v16u8)__msa_pckev_b((v16i8)reg3, (v16i8)reg2); - dst2 = (v16u8)__msa_pckev_b((v16i8)reg5, (v16i8)reg4); - __msa_st_b((v16i8)dst0, d, 0); - __msa_st_b((v16i8)dst1, d, 16); - __msa_st_b((v16i8)dst2, d, 32); - s += 64; - t += 64; - d += 48; - } -} - -void ScaleRowDown34_1_Box_MSA(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* d, - int dst_width) { - const uint8_t* s = src_ptr; - const uint8_t* t = src_ptr + src_stride; - int x; - v16u8 src0, src1, src2, src3, src4, src5, src6, src7, dst0, dst1, dst2; - v16u8 vec0, vec1, vec2, vec3, vec4, vec5; - v16u8 vec6, vec7, vec8, vec9, vec10, vec11; - v8i16 reg0, reg1, reg2, reg3, reg4, reg5; - v8i16 reg6, reg7, reg8, reg9, reg10, reg11; - v16u8 const0 = {3, 1, 1, 1, 1, 3, 3, 1, 1, 1, 1, 3, 3, 1, 1, 1}; - v16u8 const1 = {1, 3, 3, 1, 1, 1, 1, 3, 3, 1, 1, 1, 1, 3, 3, 1}; - v16u8 const2 = {1, 1, 1, 3, 3, 1, 1, 1, 1, 3, 3, 1, 1, 1, 1, 3}; - v16i8 mask0 = {0, 1, 1, 2, 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10}; - v16i8 mask1 = {10, 11, 12, 13, 13, 14, 14, 15, - 16, 17, 17, 18, 18, 19, 20, 21}; - 
v16i8 mask2 = {5, 6, 6, 7, 8, 9, 9, 10, 10, 11, 12, 13, 13, 14, 14, 15}; - v8i16 shft0 = {2, 1, 2, 2, 1, 2, 2, 1}; - v8i16 shft1 = {2, 2, 1, 2, 2, 1, 2, 2}; - v8i16 shft2 = {1, 2, 2, 1, 2, 2, 1, 2}; - - assert((dst_width % 3 == 0) && (dst_width > 0)); - - for (x = 0; x < dst_width; x += 48) { - src0 = (v16u8)__msa_ld_b((v16i8*)s, 0); - src1 = (v16u8)__msa_ld_b((v16i8*)s, 16); - src2 = (v16u8)__msa_ld_b((v16i8*)s, 32); - src3 = (v16u8)__msa_ld_b((v16i8*)s, 48); - src4 = (v16u8)__msa_ld_b((v16i8*)t, 0); - src5 = (v16u8)__msa_ld_b((v16i8*)t, 16); - src6 = (v16u8)__msa_ld_b((v16i8*)t, 32); - src7 = (v16u8)__msa_ld_b((v16i8*)t, 48); - vec0 = (v16u8)__msa_vshf_b(mask0, (v16i8)src0, (v16i8)src0); - vec1 = (v16u8)__msa_vshf_b(mask1, (v16i8)src1, (v16i8)src0); - vec2 = (v16u8)__msa_vshf_b(mask2, (v16i8)src1, (v16i8)src1); - vec3 = (v16u8)__msa_vshf_b(mask0, (v16i8)src2, (v16i8)src2); - vec4 = (v16u8)__msa_vshf_b(mask1, (v16i8)src3, (v16i8)src2); - vec5 = (v16u8)__msa_vshf_b(mask2, (v16i8)src3, (v16i8)src3); - vec6 = (v16u8)__msa_vshf_b(mask0, (v16i8)src4, (v16i8)src4); - vec7 = (v16u8)__msa_vshf_b(mask1, (v16i8)src5, (v16i8)src4); - vec8 = (v16u8)__msa_vshf_b(mask2, (v16i8)src5, (v16i8)src5); - vec9 = (v16u8)__msa_vshf_b(mask0, (v16i8)src6, (v16i8)src6); - vec10 = (v16u8)__msa_vshf_b(mask1, (v16i8)src7, (v16i8)src6); - vec11 = (v16u8)__msa_vshf_b(mask2, (v16i8)src7, (v16i8)src7); - reg0 = (v8i16)__msa_dotp_u_h(vec0, const0); - reg1 = (v8i16)__msa_dotp_u_h(vec1, const1); - reg2 = (v8i16)__msa_dotp_u_h(vec2, const2); - reg3 = (v8i16)__msa_dotp_u_h(vec3, const0); - reg4 = (v8i16)__msa_dotp_u_h(vec4, const1); - reg5 = (v8i16)__msa_dotp_u_h(vec5, const2); - reg6 = (v8i16)__msa_dotp_u_h(vec6, const0); - reg7 = (v8i16)__msa_dotp_u_h(vec7, const1); - reg8 = (v8i16)__msa_dotp_u_h(vec8, const2); - reg9 = (v8i16)__msa_dotp_u_h(vec9, const0); - reg10 = (v8i16)__msa_dotp_u_h(vec10, const1); - reg11 = (v8i16)__msa_dotp_u_h(vec11, const2); - reg0 = __msa_srar_h(reg0, shft0); - reg1 = __msa_srar_h(reg1, shft1); - reg2 = __msa_srar_h(reg2, shft2); - reg3 = __msa_srar_h(reg3, shft0); - reg4 = __msa_srar_h(reg4, shft1); - reg5 = __msa_srar_h(reg5, shft2); - reg6 = __msa_srar_h(reg6, shft0); - reg7 = __msa_srar_h(reg7, shft1); - reg8 = __msa_srar_h(reg8, shft2); - reg9 = __msa_srar_h(reg9, shft0); - reg10 = __msa_srar_h(reg10, shft1); - reg11 = __msa_srar_h(reg11, shft2); - reg0 += reg6; - reg1 += reg7; - reg2 += reg8; - reg3 += reg9; - reg4 += reg10; - reg5 += reg11; - reg0 = __msa_srari_h(reg0, 1); - reg1 = __msa_srari_h(reg1, 1); - reg2 = __msa_srari_h(reg2, 1); - reg3 = __msa_srari_h(reg3, 1); - reg4 = __msa_srari_h(reg4, 1); - reg5 = __msa_srari_h(reg5, 1); - dst0 = (v16u8)__msa_pckev_b((v16i8)reg1, (v16i8)reg0); - dst1 = (v16u8)__msa_pckev_b((v16i8)reg3, (v16i8)reg2); - dst2 = (v16u8)__msa_pckev_b((v16i8)reg5, (v16i8)reg4); - __msa_st_b((v16i8)dst0, d, 0); - __msa_st_b((v16i8)dst1, d, 16); - __msa_st_b((v16i8)dst2, d, 32); - s += 64; - t += 64; - d += 48; - } -} - -#ifdef __cplusplus -} // extern "C" -} // namespace libyuv -#endif - -#endif // !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa) diff --git a/thirdparty/libyuv/source/scale_neon.cc b/thirdparty/libyuv/source/scale_neon.cc deleted file mode 100644 index 6a0d6e1..0000000 --- a/thirdparty/libyuv/source/scale_neon.cc +++ /dev/null @@ -1,1494 +0,0 @@ -/* - * Copyright 2011 The LibYuv Project Authors. All rights reserved. 
- * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#include "libyuv/row.h" - -#ifdef __cplusplus -namespace libyuv { -extern "C" { -#endif - -// This module is for GCC Neon. -#if !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__) && \ - !defined(__aarch64__) - -// NEON downscalers with interpolation. -// Provided by Fritz Koenig - -// Read 32x1 throw away even pixels, and write 16x1. -void ScaleRowDown2_NEON(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst, - int dst_width) { - (void)src_stride; - asm volatile( - "1: \n" - // load even pixels into q0, odd into q1 - "vld2.8 {q0, q1}, [%0]! \n" - "subs %2, %2, #16 \n" // 16 processed per loop - "vst1.8 {q1}, [%1]! \n" // store odd pixels - "bgt 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst), // %1 - "+r"(dst_width) // %2 - : - : "q0", "q1" // Clobber List - ); -} - -// Read 32x1 average down and write 16x1. -void ScaleRowDown2Linear_NEON(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst, - int dst_width) { - (void)src_stride; - asm volatile( - "1: \n" - "vld2.8 {q0, q1}, [%0]! \n" // load 32 pixels - "subs %2, %2, #16 \n" // 16 processed per loop - "vrhadd.u8 q0, q0, q1 \n" // rounding half add - "vst1.8 {q0}, [%1]! \n" - "bgt 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst), // %1 - "+r"(dst_width) // %2 - : - : "q0", "q1" // Clobber List - ); -} - -// Read 32x2 average down and write 16x1. -void ScaleRowDown2Box_NEON(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst, - int dst_width) { - asm volatile( - // change the stride to row 2 pointer - "add %1, %0 \n" - "1: \n" - "vld1.8 {q0, q1}, [%0]! \n" // load row 1 and post inc - "vld1.8 {q2, q3}, [%1]! \n" // load row 2 and post inc - "subs %3, %3, #16 \n" // 16 processed per loop - "vpaddl.u8 q0, q0 \n" // row 1 add adjacent - "vpaddl.u8 q1, q1 \n" - "vpadal.u8 q0, q2 \n" // row 2 add adjacent + - // row1 - "vpadal.u8 q1, q3 \n" - "vrshrn.u16 d0, q0, #2 \n" // downshift, round and - // pack - "vrshrn.u16 d1, q1, #2 \n" - "vst1.8 {q0}, [%2]! \n" - "bgt 1b \n" - : "+r"(src_ptr), // %0 - "+r"(src_stride), // %1 - "+r"(dst), // %2 - "+r"(dst_width) // %3 - : - : "q0", "q1", "q2", "q3" // Clobber List - ); -} - -void ScaleRowDown4_NEON(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width) { - (void)src_stride; - asm volatile( - "1: \n" - "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0 - "subs %2, %2, #8 \n" // 8 processed per loop - "vst1.8 {d2}, [%1]! \n" - "bgt 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : - : "q0", "q1", "memory", "cc"); -} - -void ScaleRowDown4Box_NEON(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width) { - const uint8_t* src_ptr1 = src_ptr + src_stride; - const uint8_t* src_ptr2 = src_ptr + src_stride * 2; - const uint8_t* src_ptr3 = src_ptr + src_stride * 3; - asm volatile( - "1: \n" - "vld1.8 {q0}, [%0]! \n" // load up 16x4 - "vld1.8 {q1}, [%3]! \n" - "vld1.8 {q2}, [%4]! \n" - "vld1.8 {q3}, [%5]! \n" - "subs %2, %2, #4 \n" - "vpaddl.u8 q0, q0 \n" - "vpadal.u8 q0, q1 \n" - "vpadal.u8 q0, q2 \n" - "vpadal.u8 q0, q3 \n" - "vpaddl.u16 q0, q0 \n" - "vrshrn.u32 d0, q0, #4 \n" // divide by 16 w/rounding - "vmovn.u16 d0, q0 \n" - "vst1.32 {d0[0]}, [%1]! 
\n" - "bgt 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width), // %2 - "+r"(src_ptr1), // %3 - "+r"(src_ptr2), // %4 - "+r"(src_ptr3) // %5 - : - : "q0", "q1", "q2", "q3", "memory", "cc"); -} - -// Down scale from 4 to 3 pixels. Use the neon multilane read/write -// to load up the every 4th pixel into a 4 different registers. -// Point samples 32 pixels to 24 pixels. -void ScaleRowDown34_NEON(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width) { - (void)src_stride; - asm volatile( - "1: \n" - "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0 - "subs %2, %2, #24 \n" - "vmov d2, d3 \n" // order d0, d1, d2 - "vst3.8 {d0, d1, d2}, [%1]! \n" - "bgt 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : - : "d0", "d1", "d2", "d3", "memory", "cc"); -} - -void ScaleRowDown34_0_Box_NEON(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width) { - asm volatile( - "vmov.u8 d24, #3 \n" - "add %3, %0 \n" - "1: \n" - "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0 - "vld4.8 {d4, d5, d6, d7}, [%3]! \n" // src line 1 - "subs %2, %2, #24 \n" - - // filter src line 0 with src line 1 - // expand chars to shorts to allow for room - // when adding lines together - "vmovl.u8 q8, d4 \n" - "vmovl.u8 q9, d5 \n" - "vmovl.u8 q10, d6 \n" - "vmovl.u8 q11, d7 \n" - - // 3 * line_0 + line_1 - "vmlal.u8 q8, d0, d24 \n" - "vmlal.u8 q9, d1, d24 \n" - "vmlal.u8 q10, d2, d24 \n" - "vmlal.u8 q11, d3, d24 \n" - - // (3 * line_0 + line_1 + 2) >> 2 - "vqrshrn.u16 d0, q8, #2 \n" - "vqrshrn.u16 d1, q9, #2 \n" - "vqrshrn.u16 d2, q10, #2 \n" - "vqrshrn.u16 d3, q11, #2 \n" - - // a0 = (src[0] * 3 + s[1] * 1 + 2) >> 2 - "vmovl.u8 q8, d1 \n" - "vmlal.u8 q8, d0, d24 \n" - "vqrshrn.u16 d0, q8, #2 \n" - - // a1 = (src[1] * 1 + s[2] * 1 + 1) >> 1 - "vrhadd.u8 d1, d1, d2 \n" - - // a2 = (src[2] * 1 + s[3] * 3 + 2) >> 2 - "vmovl.u8 q8, d2 \n" - "vmlal.u8 q8, d3, d24 \n" - "vqrshrn.u16 d2, q8, #2 \n" - - "vst3.8 {d0, d1, d2}, [%1]! \n" - - "bgt 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width), // %2 - "+r"(src_stride) // %3 - : - : "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11", "d24", "memory", - "cc"); -} - -void ScaleRowDown34_1_Box_NEON(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width) { - asm volatile( - "vmov.u8 d24, #3 \n" - "add %3, %0 \n" - "1: \n" - "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0 - "vld4.8 {d4, d5, d6, d7}, [%3]! \n" // src line 1 - "subs %2, %2, #24 \n" - // average src line 0 with src line 1 - "vrhadd.u8 q0, q0, q2 \n" - "vrhadd.u8 q1, q1, q3 \n" - - // a0 = (src[0] * 3 + s[1] * 1 + 2) >> 2 - "vmovl.u8 q3, d1 \n" - "vmlal.u8 q3, d0, d24 \n" - "vqrshrn.u16 d0, q3, #2 \n" - - // a1 = (src[1] * 1 + s[2] * 1 + 1) >> 1 - "vrhadd.u8 d1, d1, d2 \n" - - // a2 = (src[2] * 1 + s[3] * 3 + 2) >> 2 - "vmovl.u8 q3, d2 \n" - "vmlal.u8 q3, d3, d24 \n" - "vqrshrn.u16 d2, q3, #2 \n" - - "vst3.8 {d0, d1, d2}, [%1]! 
\n" - "bgt 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width), // %2 - "+r"(src_stride) // %3 - : - : "r4", "q0", "q1", "q2", "q3", "d24", "memory", "cc"); -} - -#define HAS_SCALEROWDOWN38_NEON -static const uvec8 kShuf38 = {0, 3, 6, 8, 11, 14, 16, 19, - 22, 24, 27, 30, 0, 0, 0, 0}; -static const uvec8 kShuf38_2 = {0, 8, 16, 2, 10, 17, 4, 12, - 18, 6, 14, 19, 0, 0, 0, 0}; -static const vec16 kMult38_Div6 = {65536 / 12, 65536 / 12, 65536 / 12, - 65536 / 12, 65536 / 12, 65536 / 12, - 65536 / 12, 65536 / 12}; -static const vec16 kMult38_Div9 = {65536 / 18, 65536 / 18, 65536 / 18, - 65536 / 18, 65536 / 18, 65536 / 18, - 65536 / 18, 65536 / 18}; - -// 32 -> 12 -void ScaleRowDown38_NEON(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width) { - (void)src_stride; - asm volatile( - "vld1.8 {q3}, [%3] \n" - "1: \n" - "vld1.8 {d0, d1, d2, d3}, [%0]! \n" - "subs %2, %2, #12 \n" - "vtbl.u8 d4, {d0, d1, d2, d3}, d6 \n" - "vtbl.u8 d5, {d0, d1, d2, d3}, d7 \n" - "vst1.8 {d4}, [%1]! \n" - "vst1.32 {d5[0]}, [%1]! \n" - "bgt 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : "r"(&kShuf38) // %3 - : "d0", "d1", "d2", "d3", "d4", "d5", "memory", "cc"); -} - -// 32x3 -> 12x1 -void OMITFP ScaleRowDown38_3_Box_NEON(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width) { - const uint8_t* src_ptr1 = src_ptr + src_stride * 2; - - asm volatile( - "vld1.16 {q13}, [%5] \n" - "vld1.8 {q14}, [%6] \n" - "vld1.8 {q15}, [%7] \n" - "add %3, %0 \n" - "1: \n" - - // d0 = 00 40 01 41 02 42 03 43 - // d1 = 10 50 11 51 12 52 13 53 - // d2 = 20 60 21 61 22 62 23 63 - // d3 = 30 70 31 71 32 72 33 73 - "vld4.8 {d0, d1, d2, d3}, [%0]! \n" - "vld4.8 {d4, d5, d6, d7}, [%3]! \n" - "vld4.8 {d16, d17, d18, d19}, [%4]! \n" - "subs %2, %2, #12 \n" - - // Shuffle the input data around to get align the data - // so adjacent data can be added. 0,1 - 2,3 - 4,5 - 6,7 - // d0 = 00 10 01 11 02 12 03 13 - // d1 = 40 50 41 51 42 52 43 53 - "vtrn.u8 d0, d1 \n" - "vtrn.u8 d4, d5 \n" - "vtrn.u8 d16, d17 \n" - - // d2 = 20 30 21 31 22 32 23 33 - // d3 = 60 70 61 71 62 72 63 73 - "vtrn.u8 d2, d3 \n" - "vtrn.u8 d6, d7 \n" - "vtrn.u8 d18, d19 \n" - - // d0 = 00+10 01+11 02+12 03+13 - // d2 = 40+50 41+51 42+52 43+53 - "vpaddl.u8 q0, q0 \n" - "vpaddl.u8 q2, q2 \n" - "vpaddl.u8 q8, q8 \n" - - // d3 = 60+70 61+71 62+72 63+73 - "vpaddl.u8 d3, d3 \n" - "vpaddl.u8 d7, d7 \n" - "vpaddl.u8 d19, d19 \n" - - // combine source lines - "vadd.u16 q0, q2 \n" - "vadd.u16 q0, q8 \n" - "vadd.u16 d4, d3, d7 \n" - "vadd.u16 d4, d19 \n" - - // dst_ptr[3] = (s[6 + st * 0] + s[7 + st * 0] - // + s[6 + st * 1] + s[7 + st * 1] - // + s[6 + st * 2] + s[7 + st * 2]) / 6 - "vqrdmulh.s16 q2, q2, q13 \n" - "vmovn.u16 d4, q2 \n" - - // Shuffle 2,3 reg around so that 2 can be added to the - // 0,1 reg and 3 can be added to the 4,5 reg. This - // requires expanding from u8 to u16 as the 0,1 and 4,5 - // registers are already expanded. Then do transposes - // to get aligned. 
- // q2 = xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33 - "vmovl.u8 q1, d2 \n" - "vmovl.u8 q3, d6 \n" - "vmovl.u8 q9, d18 \n" - - // combine source lines - "vadd.u16 q1, q3 \n" - "vadd.u16 q1, q9 \n" - - // d4 = xx 20 xx 30 xx 22 xx 32 - // d5 = xx 21 xx 31 xx 23 xx 33 - "vtrn.u32 d2, d3 \n" - - // d4 = xx 20 xx 21 xx 22 xx 23 - // d5 = xx 30 xx 31 xx 32 xx 33 - "vtrn.u16 d2, d3 \n" - - // 0+1+2, 3+4+5 - "vadd.u16 q0, q1 \n" - - // Need to divide, but can't downshift as the the value - // isn't a power of 2. So multiply by 65536 / n - // and take the upper 16 bits. - "vqrdmulh.s16 q0, q0, q15 \n" - - // Align for table lookup, vtbl requires registers to - // be adjacent - "vmov.u8 d2, d4 \n" - - "vtbl.u8 d3, {d0, d1, d2}, d28 \n" - "vtbl.u8 d4, {d0, d1, d2}, d29 \n" - - "vst1.8 {d3}, [%1]! \n" - "vst1.32 {d4[0]}, [%1]! \n" - "bgt 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width), // %2 - "+r"(src_stride), // %3 - "+r"(src_ptr1) // %4 - : "r"(&kMult38_Div6), // %5 - "r"(&kShuf38_2), // %6 - "r"(&kMult38_Div9) // %7 - : "q0", "q1", "q2", "q3", "q8", "q9", "q13", "q14", "q15", "memory", - "cc"); -} - -// 32x2 -> 12x1 -void ScaleRowDown38_2_Box_NEON(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width) { - asm volatile( - "vld1.16 {q13}, [%4] \n" - "vld1.8 {q14}, [%5] \n" - "add %3, %0 \n" - "1: \n" - - // d0 = 00 40 01 41 02 42 03 43 - // d1 = 10 50 11 51 12 52 13 53 - // d2 = 20 60 21 61 22 62 23 63 - // d3 = 30 70 31 71 32 72 33 73 - "vld4.8 {d0, d1, d2, d3}, [%0]! \n" - "vld4.8 {d4, d5, d6, d7}, [%3]! \n" - "subs %2, %2, #12 \n" - - // Shuffle the input data around to get align the data - // so adjacent data can be added. 0,1 - 2,3 - 4,5 - 6,7 - // d0 = 00 10 01 11 02 12 03 13 - // d1 = 40 50 41 51 42 52 43 53 - "vtrn.u8 d0, d1 \n" - "vtrn.u8 d4, d5 \n" - - // d2 = 20 30 21 31 22 32 23 33 - // d3 = 60 70 61 71 62 72 63 73 - "vtrn.u8 d2, d3 \n" - "vtrn.u8 d6, d7 \n" - - // d0 = 00+10 01+11 02+12 03+13 - // d2 = 40+50 41+51 42+52 43+53 - "vpaddl.u8 q0, q0 \n" - "vpaddl.u8 q2, q2 \n" - - // d3 = 60+70 61+71 62+72 63+73 - "vpaddl.u8 d3, d3 \n" - "vpaddl.u8 d7, d7 \n" - - // combine source lines - "vadd.u16 q0, q2 \n" - "vadd.u16 d4, d3, d7 \n" - - // dst_ptr[3] = (s[6] + s[7] + s[6+st] + s[7+st]) / 4 - "vqrshrn.u16 d4, q2, #2 \n" - - // Shuffle 2,3 reg around so that 2 can be added to the - // 0,1 reg and 3 can be added to the 4,5 reg. This - // requires expanding from u8 to u16 as the 0,1 and 4,5 - // registers are already expanded. Then do transposes - // to get aligned. - // q2 = xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33 - "vmovl.u8 q1, d2 \n" - "vmovl.u8 q3, d6 \n" - - // combine source lines - "vadd.u16 q1, q3 \n" - - // d4 = xx 20 xx 30 xx 22 xx 32 - // d5 = xx 21 xx 31 xx 23 xx 33 - "vtrn.u32 d2, d3 \n" - - // d4 = xx 20 xx 21 xx 22 xx 23 - // d5 = xx 30 xx 31 xx 32 xx 33 - "vtrn.u16 d2, d3 \n" - - // 0+1+2, 3+4+5 - "vadd.u16 q0, q1 \n" - - // Need to divide, but can't downshift as the the value - // isn't a power of 2. So multiply by 65536 / n - // and take the upper 16 bits. - "vqrdmulh.s16 q0, q0, q13 \n" - - // Align for table lookup, vtbl requires registers to - // be adjacent - "vmov.u8 d2, d4 \n" - - "vtbl.u8 d3, {d0, d1, d2}, d28 \n" - "vtbl.u8 d4, {d0, d1, d2}, d29 \n" - - "vst1.8 {d3}, [%1]! \n" - "vst1.32 {d4[0]}, [%1]! 
\n" - "bgt 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width), // %2 - "+r"(src_stride) // %3 - : "r"(&kMult38_Div6), // %4 - "r"(&kShuf38_2) // %5 - : "q0", "q1", "q2", "q3", "q13", "q14", "memory", "cc"); -} - -void ScaleRowUp2_Linear_NEON(const uint8_t* src_ptr, - uint8_t* dst_ptr, - int dst_width) { - const uint8_t* src_temp = src_ptr + 1; - asm volatile( - "vmov.u8 d30, #3 \n" - - "1: \n" - "vld1.8 {d4}, [%0]! \n" // 01234567 - "vld1.8 {d5}, [%3]! \n" // 12345678 - - "vmovl.u8 q0, d4 \n" // 01234567 (16b) - "vmovl.u8 q1, d5 \n" // 12345678 (16b) - "vmlal.u8 q0, d5, d30 \n" // 3*near+far (odd) - "vmlal.u8 q1, d4, d30 \n" // 3*near+far (even) - - "vrshrn.u16 d1, q0, #2 \n" // 3/4*near+1/4*far (odd) - "vrshrn.u16 d0, q1, #2 \n" // 3/4*near+1/4*far (even) - - "vst2.8 {d0, d1}, [%1]! \n" // store - "subs %2, %2, #16 \n" // 8 sample -> 16 sample - "bgt 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width), // %2 - "+r"(src_temp) // %3 - : - : "memory", "cc", "q0", "q1", "q2", "q15" // Clobber List - ); -} - -void ScaleRowUp2_Bilinear_NEON(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - ptrdiff_t dst_stride, - int dst_width) { - const uint8_t* src_ptr1 = src_ptr + src_stride; - uint8_t* dst_ptr1 = dst_ptr + dst_stride; - const uint8_t* src_temp = src_ptr + 1; - const uint8_t* src_temp1 = src_ptr1 + 1; - - asm volatile( - "vmov.u16 q15, #3 \n" - "vmov.u8 d28, #3 \n" - - "1: \n" - "vld1.8 {d4}, [%0]! \n" // 01234567 - "vld1.8 {d5}, [%5]! \n" // 12345678 - - "vmovl.u8 q0, d4 \n" // 01234567 (16b) - "vmovl.u8 q1, d5 \n" // 12345678 (16b) - "vmlal.u8 q0, d5, d28 \n" // 3*near+far (1, odd) - "vmlal.u8 q1, d4, d28 \n" // 3*near+far (1, even) - - "vld1.8 {d8}, [%1]! \n" - "vld1.8 {d9}, [%6]! \n" - - "vmovl.u8 q2, d8 \n" - "vmovl.u8 q3, d9 \n" - "vmlal.u8 q2, d9, d28 \n" // 3*near+far (2, odd) - "vmlal.u8 q3, d8, d28 \n" // 3*near+far (2, even) - - // e o - // q1 q0 - // q3 q2 - - "vmovq q4, q2 \n" - "vmovq q5, q3 \n" - "vmla.u16 q4, q0, q15 \n" // 9 3 3 1 (1, odd) - "vmla.u16 q5, q1, q15 \n" // 9 3 3 1 (1, even) - "vmla.u16 q0, q2, q15 \n" // 9 3 3 1 (2, odd) - "vmla.u16 q1, q3, q15 \n" // 9 3 3 1 (2, even) - - // e o - // q5 q4 - // q1 q0 - - "vrshrn.u16 d2, q1, #4 \n" // 2, even - "vrshrn.u16 d3, q0, #4 \n" // 2, odd - "vrshrn.u16 d0, q5, #4 \n" // 1, even - "vrshrn.u16 d1, q4, #4 \n" // 1, odd - - "vst2.8 {d0, d1}, [%2]! \n" // store - "vst2.8 {d2, d3}, [%3]! \n" // store - "subs %4, %4, #16 \n" // 8 sample -> 16 sample - "bgt 1b \n" - : "+r"(src_ptr), // %0 - "+r"(src_ptr1), // %1 - "+r"(dst_ptr), // %2 - "+r"(dst_ptr1), // %3 - "+r"(dst_width), // %4 - "+r"(src_temp), // %5 - "+r"(src_temp1) // %6 - : - : "memory", "cc", "q0", "q1", "q2", "q3", "q4", "q5", "d28", - "q15" // Clobber List - ); -} - -void ScaleRowUp2_Linear_12_NEON(const uint16_t* src_ptr, - uint16_t* dst_ptr, - int dst_width) { - const uint16_t* src_temp = src_ptr + 1; - asm volatile( - "vmov.u16 q15, #3 \n" - - "1: \n" - "vld1.16 {q1}, [%0]! \n" // 01234567 (16b) - "vld1.16 {q0}, [%3]! \n" // 12345678 (16b) - - "vmovq q2, q0 \n" - "vmla.u16 q0, q1, q15 \n" // 3*near+far (odd) - "vmla.u16 q1, q2, q15 \n" // 3*near+far (even) - - "vrshr.u16 q0, q0, #2 \n" // 3/4*near+1/4*far (odd) - "vrshr.u16 q1, q1, #2 \n" // 3/4*near+1/4*far (even) - - "vst2.16 {d0, d1, d2, d3}, [%1]! 
\n" // store - "subs %2, %2, #16 \n" // 8 sample -> 16 sample - "bgt 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width), // %2 - "+r"(src_temp) // %3 - : - : "memory", "cc", "q0", "q1", "q2", "q15" // Clobber List - ); -} - -void ScaleRowUp2_Bilinear_12_NEON(const uint16_t* src_ptr, - ptrdiff_t src_stride, - uint16_t* dst_ptr, - ptrdiff_t dst_stride, - int dst_width) { - const uint16_t* src_ptr1 = src_ptr + src_stride; - uint16_t* dst_ptr1 = dst_ptr + dst_stride; - const uint16_t* src_temp = src_ptr + 1; - const uint16_t* src_temp1 = src_ptr1 + 1; - - asm volatile( - "vmov.u16 q15, #3 \n" - - "1: \n" - "vld1.16 {q0}, [%0]! \n" // 01234567 (16b) - "vld1.16 {q1}, [%5]! \n" // 12345678 (16b) - - "vmovq q2, q0 \n" - "vmla.u16 q0, q1, q15 \n" // 3*near+far (odd) - "vmla.u16 q1, q2, q15 \n" // 3*near+far (even) - - "vld1.16 {q2}, [%1]! \n" // 01234567 (16b) - "vld1.16 {q3}, [%6]! \n" // 12345678 (16b) - - "vmovq q4, q2 \n" - "vmla.u16 q2, q3, q15 \n" // 3*near+far (odd) - "vmla.u16 q3, q4, q15 \n" // 3*near+far (even) - - "vmovq q4, q2 \n" - "vmovq q5, q3 \n" - "vmla.u16 q4, q0, q15 \n" // 9 3 3 1 (1, odd) - "vmla.u16 q5, q1, q15 \n" // 9 3 3 1 (1, even) - "vmla.u16 q0, q2, q15 \n" // 9 3 3 1 (2, odd) - "vmla.u16 q1, q3, q15 \n" // 9 3 3 1 (2, even) - - "vrshr.u16 q2, q1, #4 \n" // 2, even - "vrshr.u16 q3, q0, #4 \n" // 2, odd - "vrshr.u16 q0, q5, #4 \n" // 1, even - "vrshr.u16 q1, q4, #4 \n" // 1, odd - - "vst2.16 {d0, d1, d2, d3}, [%2]! \n" // store - "vst2.16 {d4, d5, d6, d7}, [%3]! \n" // store - "subs %4, %4, #16 \n" // 8 sample -> 16 sample - "bgt 1b \n" - : "+r"(src_ptr), // %0 - "+r"(src_ptr1), // %1 - "+r"(dst_ptr), // %2 - "+r"(dst_ptr1), // %3 - "+r"(dst_width), // %4 - "+r"(src_temp), // %5 - "+r"(src_temp1) // %6 - : - : "memory", "cc", "q0", "q1", "q2", "q3", "q4", "q5", - "q15" // Clobber List - ); -} - -void ScaleRowUp2_Linear_16_NEON(const uint16_t* src_ptr, - uint16_t* dst_ptr, - int dst_width) { - const uint16_t* src_temp = src_ptr + 1; - asm volatile( - "vmov.u16 d31, #3 \n" - - "1: \n" - "vld1.16 {q0}, [%0]! \n" // 01234567 (16b) - "vld1.16 {q1}, [%3]! \n" // 12345678 (16b) - - "vmovl.u16 q2, d0 \n" // 0123 (32b) - "vmovl.u16 q3, d1 \n" // 4567 (32b) - "vmovl.u16 q4, d2 \n" // 1234 (32b) - "vmovl.u16 q5, d3 \n" // 5678 (32b) - - "vmlal.u16 q2, d2, d31 \n" - "vmlal.u16 q3, d3, d31 \n" - "vmlal.u16 q4, d0, d31 \n" - "vmlal.u16 q5, d1, d31 \n" - - "vrshrn.u32 d0, q4, #2 \n" - "vrshrn.u32 d1, q5, #2 \n" - "vrshrn.u32 d2, q2, #2 \n" - "vrshrn.u32 d3, q3, #2 \n" - - "vst2.16 {q0, q1}, [%1]! \n" // store - "subs %2, %2, #16 \n" // 8 sample -> 16 sample - "bgt 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width), // %2 - "+r"(src_temp) // %3 - : - : "memory", "cc", "q0", "q1", "q2", "q15" // Clobber List - ); -} - -void ScaleRowUp2_Bilinear_16_NEON(const uint16_t* src_ptr, - ptrdiff_t src_stride, - uint16_t* dst_ptr, - ptrdiff_t dst_stride, - int dst_width) { - const uint16_t* src_ptr1 = src_ptr + src_stride; - uint16_t* dst_ptr1 = dst_ptr + dst_stride; - const uint16_t* src_temp = src_ptr + 1; - const uint16_t* src_temp1 = src_ptr1 + 1; - - asm volatile( - "vmov.u16 d31, #3 \n" - "vmov.u32 q14, #3 \n" - - "1: \n" - "vld1.16 {d0}, [%0]! \n" // 0123 (16b) - "vld1.16 {d1}, [%5]! \n" // 1234 (16b) - "vmovl.u16 q2, d0 \n" // 0123 (32b) - "vmovl.u16 q3, d1 \n" // 1234 (32b) - "vmlal.u16 q2, d1, d31 \n" - "vmlal.u16 q3, d0, d31 \n" - - "vld1.16 {d0}, [%1]! \n" // 0123 (16b) - "vld1.16 {d1}, [%6]! 
\n" // 1234 (16b) - "vmovl.u16 q4, d0 \n" // 0123 (32b) - "vmovl.u16 q5, d1 \n" // 1234 (32b) - "vmlal.u16 q4, d1, d31 \n" - "vmlal.u16 q5, d0, d31 \n" - - "vmovq q0, q4 \n" - "vmovq q1, q5 \n" - "vmla.u32 q4, q2, q14 \n" - "vmla.u32 q5, q3, q14 \n" - "vmla.u32 q2, q0, q14 \n" - "vmla.u32 q3, q1, q14 \n" - - "vrshrn.u32 d1, q4, #4 \n" - "vrshrn.u32 d0, q5, #4 \n" - "vrshrn.u32 d3, q2, #4 \n" - "vrshrn.u32 d2, q3, #4 \n" - - "vst2.16 {d0, d1}, [%2]! \n" // store - "vst2.16 {d2, d3}, [%3]! \n" // store - "subs %4, %4, #8 \n" // 4 sample -> 8 sample - "bgt 1b \n" - : "+r"(src_ptr), // %0 - "+r"(src_ptr1), // %1 - "+r"(dst_ptr), // %2 - "+r"(dst_ptr1), // %3 - "+r"(dst_width), // %4 - "+r"(src_temp), // %5 - "+r"(src_temp1) // %6 - : - : "memory", "cc", "q0", "q1", "q2", "q3", "q4", "q5", "q14", - "d31" // Clobber List - ); -} - -void ScaleUVRowUp2_Linear_NEON(const uint8_t* src_ptr, - uint8_t* dst_ptr, - int dst_width) { - const uint8_t* src_temp = src_ptr + 2; - asm volatile( - "vmov.u8 d30, #3 \n" - - "1: \n" - "vld1.8 {d4}, [%0]! \n" // 00112233 (1u1v) - "vld1.8 {d5}, [%3]! \n" // 11223344 (1u1v) - - "vmovl.u8 q0, d4 \n" // 00112233 (1u1v, 16b) - "vmovl.u8 q1, d5 \n" // 11223344 (1u1v, 16b) - "vmlal.u8 q0, d5, d30 \n" // 3*near+far (odd) - "vmlal.u8 q1, d4, d30 \n" // 3*near+far (even) - - "vrshrn.u16 d1, q0, #2 \n" // 3/4*near+1/4*far (odd) - "vrshrn.u16 d0, q1, #2 \n" // 3/4*near+1/4*far (even) - - "vst2.16 {d0, d1}, [%1]! \n" // store - "subs %2, %2, #8 \n" // 4 uv -> 8 uv - "bgt 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width), // %2 - "+r"(src_temp) // %3 - : - : "memory", "cc", "q0", "q1", "q2", "d30" // Clobber List - ); -} - -void ScaleUVRowUp2_Bilinear_NEON(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - ptrdiff_t dst_stride, - int dst_width) { - const uint8_t* src_ptr1 = src_ptr + src_stride; - uint8_t* dst_ptr1 = dst_ptr + dst_stride; - const uint8_t* src_temp = src_ptr + 2; - const uint8_t* src_temp1 = src_ptr1 + 2; - - asm volatile( - "vmov.u16 q15, #3 \n" - "vmov.u8 d28, #3 \n" - - "1: \n" - "vld1.8 {d4}, [%0]! \n" // 00112233 (1u1v) - "vld1.8 {d5}, [%5]! \n" // 11223344 (1u1v) - - "vmovl.u8 q0, d4 \n" // 00112233 (1u1v, 16b) - "vmovl.u8 q1, d5 \n" // 11223344 (1u1v, 16b) - "vmlal.u8 q0, d5, d28 \n" // 3*near+far (1, odd) - "vmlal.u8 q1, d4, d28 \n" // 3*near+far (1, even) - - "vld1.8 {d8}, [%1]! \n" // 00112233 (1u1v) - "vld1.8 {d9}, [%6]! \n" // 11223344 (1u1v) - - "vmovl.u8 q2, d8 \n" // 00112233 (1u1v, 16b) - "vmovl.u8 q3, d9 \n" // 11223344 (1u1v, 16b) - "vmlal.u8 q2, d9, d28 \n" // 3*near+far (2, odd) - "vmlal.u8 q3, d8, d28 \n" // 3*near+far (2, even) - - // e o - // q1 q0 - // q3 q2 - - "vmovq q4, q2 \n" - "vmovq q5, q3 \n" - "vmla.u16 q4, q0, q15 \n" // 9 3 3 1 (1, odd) - "vmla.u16 q5, q1, q15 \n" // 9 3 3 1 (1, even) - "vmla.u16 q0, q2, q15 \n" // 9 3 3 1 (2, odd) - "vmla.u16 q1, q3, q15 \n" // 9 3 3 1 (2, even) - - // e o - // q5 q4 - // q1 q0 - - "vrshrn.u16 d2, q1, #4 \n" // 2, even - "vrshrn.u16 d3, q0, #4 \n" // 2, odd - "vrshrn.u16 d0, q5, #4 \n" // 1, even - "vrshrn.u16 d1, q4, #4 \n" // 1, odd - - "vst2.16 {d0, d1}, [%2]! \n" // store - "vst2.16 {d2, d3}, [%3]! 
\n" // store - "subs %4, %4, #8 \n" // 4 uv -> 8 uv - "bgt 1b \n" - : "+r"(src_ptr), // %0 - "+r"(src_ptr1), // %1 - "+r"(dst_ptr), // %2 - "+r"(dst_ptr1), // %3 - "+r"(dst_width), // %4 - "+r"(src_temp), // %5 - "+r"(src_temp1) // %6 - : - : "memory", "cc", "q0", "q1", "q2", "q3", "q4", "q5", "d28", - "q15" // Clobber List - ); -} - -void ScaleUVRowUp2_Linear_16_NEON(const uint16_t* src_ptr, - uint16_t* dst_ptr, - int dst_width) { - const uint16_t* src_temp = src_ptr + 2; - asm volatile( - "vmov.u16 d30, #3 \n" - - "1: \n" - "vld1.16 {q0}, [%0]! \n" // 00112233 (1u1v, 16) - "vld1.16 {q1}, [%3]! \n" // 11223344 (1u1v, 16) - - "vmovl.u16 q2, d0 \n" // 0011 (1u1v, 32b) - "vmovl.u16 q3, d2 \n" // 1122 (1u1v, 32b) - "vmovl.u16 q4, d1 \n" // 2233 (1u1v, 32b) - "vmovl.u16 q5, d3 \n" // 3344 (1u1v, 32b) - "vmlal.u16 q2, d2, d30 \n" // 3*near+far (odd) - "vmlal.u16 q3, d0, d30 \n" // 3*near+far (even) - "vmlal.u16 q4, d3, d30 \n" // 3*near+far (odd) - "vmlal.u16 q5, d1, d30 \n" // 3*near+far (even) - - "vrshrn.u32 d1, q2, #2 \n" // 3/4*near+1/4*far (odd) - "vrshrn.u32 d0, q3, #2 \n" // 3/4*near+1/4*far (even) - "vrshrn.u32 d3, q4, #2 \n" // 3/4*near+1/4*far (odd) - "vrshrn.u32 d2, q5, #2 \n" // 3/4*near+1/4*far (even) - - "vst2.32 {d0, d1}, [%1]! \n" // store - "vst2.32 {d2, d3}, [%1]! \n" // store - "subs %2, %2, #8 \n" // 4 uv -> 8 uv - "bgt 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width), // %2 - "+r"(src_temp) // %3 - : - : "memory", "cc", "q0", "q1", "q2", "q3", "q4", "q5", - "d30" // Clobber List - ); -} - -void ScaleUVRowUp2_Bilinear_16_NEON(const uint16_t* src_ptr, - ptrdiff_t src_stride, - uint16_t* dst_ptr, - ptrdiff_t dst_stride, - int dst_width) { - const uint16_t* src_ptr1 = src_ptr + src_stride; - uint16_t* dst_ptr1 = dst_ptr + dst_stride; - const uint16_t* src_temp = src_ptr + 2; - const uint16_t* src_temp1 = src_ptr1 + 2; - - asm volatile( - "vmov.u16 d30, #3 \n" - "vmov.u32 q14, #3 \n" - - "1: \n" - "vld1.8 {d0}, [%0]! \n" // 0011 (1u1v) - "vld1.8 {d1}, [%5]! \n" // 1122 (1u1v) - "vmovl.u16 q2, d0 \n" // 0011 (1u1v, 32b) - "vmovl.u16 q3, d1 \n" // 1122 (1u1v, 32b) - "vmlal.u16 q2, d1, d30 \n" // 3*near+far (1, odd) - "vmlal.u16 q3, d0, d30 \n" // 3*near+far (1, even) - - "vld1.8 {d0}, [%1]! \n" // 0011 (1u1v) - "vld1.8 {d1}, [%6]! \n" // 1122 (1u1v) - "vmovl.u16 q4, d0 \n" // 0011 (1u1v, 32b) - "vmovl.u16 q5, d1 \n" // 1122 (1u1v, 32b) - "vmlal.u16 q4, d1, d30 \n" // 3*near+far (2, odd) - "vmlal.u16 q5, d0, d30 \n" // 3*near+far (2, even) - - "vmovq q0, q4 \n" - "vmovq q1, q5 \n" - "vmla.u32 q4, q2, q14 \n" // 9 3 3 1 (1, odd) - "vmla.u32 q5, q3, q14 \n" // 9 3 3 1 (1, even) - "vmla.u32 q2, q0, q14 \n" // 9 3 3 1 (2, odd) - "vmla.u32 q3, q1, q14 \n" // 9 3 3 1 (2, even) - - "vrshrn.u32 d1, q4, #4 \n" // 1, odd - "vrshrn.u32 d0, q5, #4 \n" // 1, even - "vrshrn.u32 d3, q2, #4 \n" // 2, odd - "vrshrn.u32 d2, q3, #4 \n" // 2, even - - "vst2.32 {d0, d1}, [%2]! \n" // store - "vst2.32 {d2, d3}, [%3]! \n" // store - "subs %4, %4, #4 \n" // 2 uv -> 4 uv - "bgt 1b \n" - : "+r"(src_ptr), // %0 - "+r"(src_ptr1), // %1 - "+r"(dst_ptr), // %2 - "+r"(dst_ptr1), // %3 - "+r"(dst_width), // %4 - "+r"(src_temp), // %5 - "+r"(src_temp1) // %6 - : - : "memory", "cc", "q0", "q1", "q2", "q3", "q4", "q5", "q14", - "d30" // Clobber List - ); -} - -// Add a row of bytes to a row of shorts. Used for box filter. -// Reads 16 bytes and accumulates to 16 shorts at a time. 
-void ScaleAddRow_NEON(const uint8_t* src_ptr, - uint16_t* dst_ptr, - int src_width) { - asm volatile( - "1: \n" - "vld1.16 {q1, q2}, [%1] \n" // load accumulator - "vld1.8 {q0}, [%0]! \n" // load 16 bytes - "vaddw.u8 q2, q2, d1 \n" // add - "vaddw.u8 q1, q1, d0 \n" - "vst1.16 {q1, q2}, [%1]! \n" // store accumulator - "subs %2, %2, #16 \n" // 16 processed per loop - "bgt 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(src_width) // %2 - : - : "memory", "cc", "q0", "q1", "q2" // Clobber List - ); -} - -// TODO(Yang Zhang): Investigate less load instructions for -// the x/dx stepping -#define LOAD2_DATA8_LANE(n) \ - "lsr %5, %3, #16 \n" \ - "add %6, %1, %5 \n" \ - "add %3, %3, %4 \n" \ - "vld2.8 {d6[" #n "], d7[" #n "]}, [%6] \n" - -// The NEON version mimics this formula (from row_common.cc): -// #define BLENDER(a, b, f) (uint8_t)((int)(a) + -// ((((int)((f)) * ((int)(b) - (int)(a))) + 0x8000) >> 16)) - -void ScaleFilterCols_NEON(uint8_t* dst_ptr, - const uint8_t* src_ptr, - int dst_width, - int x, - int dx) { - int dx_offset[4] = {0, 1, 2, 3}; - int* tmp = dx_offset; - const uint8_t* src_tmp = src_ptr; - asm volatile ( - "vdup.32 q0, %3 \n" // x - "vdup.32 q1, %4 \n" // dx - "vld1.32 {q2}, [%5] \n" // 0 1 2 3 - "vshl.i32 q3, q1, #2 \n" // 4 * dx - "vmul.s32 q1, q1, q2 \n" - // x , x + 1 * dx, x + 2 * dx, x + 3 * dx - "vadd.s32 q1, q1, q0 \n" - // x + 4 * dx, x + 5 * dx, x + 6 * dx, x + 7 * dx - "vadd.s32 q2, q1, q3 \n" - "vshl.i32 q0, q3, #1 \n" // 8 * dx - "1: \n" - LOAD2_DATA8_LANE(0) - LOAD2_DATA8_LANE(1) - LOAD2_DATA8_LANE(2) - LOAD2_DATA8_LANE(3) - LOAD2_DATA8_LANE(4) - LOAD2_DATA8_LANE(5) - LOAD2_DATA8_LANE(6) - LOAD2_DATA8_LANE(7) - "vmov q10, q1 \n" - "vmov q11, q2 \n" - "vuzp.16 q10, q11 \n" - "vmovl.u8 q8, d6 \n" - "vmovl.u8 q9, d7 \n" - "vsubl.s16 q11, d18, d16 \n" - "vsubl.s16 q12, d19, d17 \n" - "vmovl.u16 q13, d20 \n" - "vmovl.u16 q10, d21 \n" - "vmul.s32 q11, q11, q13 \n" - "vmul.s32 q12, q12, q10 \n" - "vrshrn.s32 d18, q11, #16 \n" - "vrshrn.s32 d19, q12, #16 \n" - "vadd.s16 q8, q8, q9 \n" - "vmovn.s16 d6, q8 \n" - - "vst1.8 {d6}, [%0]! \n" // store pixels - "vadd.s32 q1, q1, q0 \n" - "vadd.s32 q2, q2, q0 \n" - "subs %2, %2, #8 \n" // 8 processed per loop - "bgt 1b \n" - : "+r"(dst_ptr), // %0 - "+r"(src_ptr), // %1 - "+r"(dst_width), // %2 - "+r"(x), // %3 - "+r"(dx), // %4 - "+r"(tmp), // %5 - "+r"(src_tmp) // %6 - : - : "memory", "cc", "q0", "q1", "q2", "q3", - "q8", "q9", "q10", "q11", "q12", "q13" - ); -} - -#undef LOAD2_DATA8_LANE - -// 16x2 -> 16x1 -void ScaleFilterRows_NEON(uint8_t* dst_ptr, - const uint8_t* src_ptr, - ptrdiff_t src_stride, - int dst_width, - int source_y_fraction) { - asm volatile( - "cmp %4, #0 \n" - "beq 100f \n" - "add %2, %1 \n" - "cmp %4, #64 \n" - "beq 75f \n" - "cmp %4, #128 \n" - "beq 50f \n" - "cmp %4, #192 \n" - "beq 25f \n" - - "vdup.8 d5, %4 \n" - "rsb %4, #256 \n" - "vdup.8 d4, %4 \n" - // General purpose row blend. - "1: \n" - "vld1.8 {q0}, [%1]! \n" - "vld1.8 {q1}, [%2]! \n" - "subs %3, %3, #16 \n" - "vmull.u8 q13, d0, d4 \n" - "vmull.u8 q14, d1, d4 \n" - "vmlal.u8 q13, d2, d5 \n" - "vmlal.u8 q14, d3, d5 \n" - "vrshrn.u16 d0, q13, #8 \n" - "vrshrn.u16 d1, q14, #8 \n" - "vst1.8 {q0}, [%0]! \n" - "bgt 1b \n" - "b 99f \n" - - // Blend 25 / 75. - "25: \n" - "vld1.8 {q0}, [%1]! \n" - "vld1.8 {q1}, [%2]! \n" - "subs %3, %3, #16 \n" - "vrhadd.u8 q0, q1 \n" - "vrhadd.u8 q0, q1 \n" - "vst1.8 {q0}, [%0]! \n" - "bgt 25b \n" - "b 99f \n" - - // Blend 50 / 50. - "50: \n" - "vld1.8 {q0}, [%1]! \n" - "vld1.8 {q1}, [%2]! 
\n" - "subs %3, %3, #16 \n" - "vrhadd.u8 q0, q1 \n" - "vst1.8 {q0}, [%0]! \n" - "bgt 50b \n" - "b 99f \n" - - // Blend 75 / 25. - "75: \n" - "vld1.8 {q1}, [%1]! \n" - "vld1.8 {q0}, [%2]! \n" - "subs %3, %3, #16 \n" - "vrhadd.u8 q0, q1 \n" - "vrhadd.u8 q0, q1 \n" - "vst1.8 {q0}, [%0]! \n" - "bgt 75b \n" - "b 99f \n" - - // Blend 100 / 0 - Copy row unchanged. - "100: \n" - "vld1.8 {q0}, [%1]! \n" - "subs %3, %3, #16 \n" - "vst1.8 {q0}, [%0]! \n" - "bgt 100b \n" - - "99: \n" - "vst1.8 {d1[7]}, [%0] \n" - : "+r"(dst_ptr), // %0 - "+r"(src_ptr), // %1 - "+r"(src_stride), // %2 - "+r"(dst_width), // %3 - "+r"(source_y_fraction) // %4 - : - : "q0", "q1", "d4", "d5", "q13", "q14", "memory", "cc"); -} - -void ScaleARGBRowDown2_NEON(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst, - int dst_width) { - (void)src_stride; - asm volatile( - "1: \n" - "vld4.32 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels. - "vld4.32 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB - "subs %2, %2, #8 \n" // 8 processed per loop - "vmov q2, q1 \n" // load next 8 ARGB - "vst2.32 {q2, q3}, [%1]! \n" // store odd pixels - "bgt 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst), // %1 - "+r"(dst_width) // %2 - : - : "memory", "cc", "q0", "q1", "q2", "q3" // Clobber List - ); -} - -// 46: f964 018d vld4.32 {d16,d18,d20,d22}, [r4]! -// 4a: 3e04 subs r6, #4 -// 4c: f964 118d vld4.32 {d17,d19,d21,d23}, [r4]! -// 50: ef64 21f4 vorr q9, q10, q10 -// 54: f942 038d vst2.32 {d16-d19}, [r2]! -// 58: d1f5 bne.n 46 - -void ScaleARGBRowDown2Linear_NEON(const uint8_t* src_argb, - ptrdiff_t src_stride, - uint8_t* dst_argb, - int dst_width) { - (void)src_stride; - asm volatile( - "1: \n" - "vld4.32 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels. - "vld4.32 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB - "subs %2, %2, #8 \n" // 8 processed per loop - "vrhadd.u8 q0, q0, q1 \n" // rounding half add - "vrhadd.u8 q1, q2, q3 \n" // rounding half add - "vst2.32 {q0, q1}, [%1]! \n" - "bgt 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_argb), // %1 - "+r"(dst_width) // %2 - : - : "memory", "cc", "q0", "q1", "q2", "q3" // Clobber List - ); -} - -void ScaleARGBRowDown2Box_NEON(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst, - int dst_width) { - asm volatile( - // change the stride to row 2 pointer - "add %1, %1, %0 \n" - "1: \n" - "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels. - "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB - "subs %3, %3, #8 \n" // 8 processed per loop. - "vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts. - "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts. - "vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts. - "vpaddl.u8 q3, q3 \n" // A 16 bytes -> 8 shorts. - "vld4.8 {d16, d18, d20, d22}, [%1]! \n" // load 8 more ARGB - "vld4.8 {d17, d19, d21, d23}, [%1]! \n" // load last 8 ARGB - "vpadal.u8 q0, q8 \n" // B 16 bytes -> 8 shorts. - "vpadal.u8 q1, q9 \n" // G 16 bytes -> 8 shorts. - "vpadal.u8 q2, q10 \n" // R 16 bytes -> 8 shorts. - "vpadal.u8 q3, q11 \n" // A 16 bytes -> 8 shorts. - "vrshrn.u16 d0, q0, #2 \n" // round and pack to bytes - "vrshrn.u16 d1, q1, #2 \n" - "vrshrn.u16 d2, q2, #2 \n" - "vrshrn.u16 d3, q3, #2 \n" - "vst4.8 {d0, d1, d2, d3}, [%2]! \n" - "bgt 1b \n" - : "+r"(src_ptr), // %0 - "+r"(src_stride), // %1 - "+r"(dst), // %2 - "+r"(dst_width) // %3 - : - : "memory", "cc", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11"); -} - -// Reads 4 pixels at a time. -// Alignment requirement: src_argb 4 byte aligned. 
-void ScaleARGBRowDownEven_NEON(const uint8_t* src_argb, - ptrdiff_t src_stride, - int src_stepx, - uint8_t* dst_argb, - int dst_width) { - (void)src_stride; - asm volatile( - "mov r12, %3, lsl #2 \n" - "1: \n" - "vld1.32 {d0[0]}, [%0], r12 \n" - "vld1.32 {d0[1]}, [%0], r12 \n" - "vld1.32 {d1[0]}, [%0], r12 \n" - "vld1.32 {d1[1]}, [%0], r12 \n" - "subs %2, %2, #4 \n" // 4 pixels per loop. - "vst1.8 {q0}, [%1]! \n" - "bgt 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_argb), // %1 - "+r"(dst_width) // %2 - : "r"(src_stepx) // %3 - : "memory", "cc", "r12", "q0"); -} - -// Reads 4 pixels at a time. -// Alignment requirement: src_argb 4 byte aligned. -void ScaleARGBRowDownEvenBox_NEON(const uint8_t* src_argb, - ptrdiff_t src_stride, - int src_stepx, - uint8_t* dst_argb, - int dst_width) { - asm volatile( - "mov r12, %4, lsl #2 \n" - "add %1, %1, %0 \n" - "1: \n" - "vld1.8 {d0}, [%0], r12 \n" // 4 2x2 blocks -> 2x1 - "vld1.8 {d1}, [%1], r12 \n" - "vld1.8 {d2}, [%0], r12 \n" - "vld1.8 {d3}, [%1], r12 \n" - "vld1.8 {d4}, [%0], r12 \n" - "vld1.8 {d5}, [%1], r12 \n" - "vld1.8 {d6}, [%0], r12 \n" - "vld1.8 {d7}, [%1], r12 \n" - "vaddl.u8 q0, d0, d1 \n" - "vaddl.u8 q1, d2, d3 \n" - "vaddl.u8 q2, d4, d5 \n" - "vaddl.u8 q3, d6, d7 \n" - "vswp.8 d1, d2 \n" // ab_cd -> ac_bd - "vswp.8 d5, d6 \n" // ef_gh -> eg_fh - "vadd.u16 q0, q0, q1 \n" // (a+b)_(c+d) - "vadd.u16 q2, q2, q3 \n" // (e+f)_(g+h) - "vrshrn.u16 d0, q0, #2 \n" // first 2 pixels. - "vrshrn.u16 d1, q2, #2 \n" // next 2 pixels. - "subs %3, %3, #4 \n" // 4 pixels per loop. - "vst1.8 {q0}, [%2]! \n" - "bgt 1b \n" - : "+r"(src_argb), // %0 - "+r"(src_stride), // %1 - "+r"(dst_argb), // %2 - "+r"(dst_width) // %3 - : "r"(src_stepx) // %4 - : "memory", "cc", "r12", "q0", "q1", "q2", "q3"); -} - -// TODO(Yang Zhang): Investigate less load instructions for -// the x/dx stepping -#define LOAD1_DATA32_LANE(dn, n) \ - "lsr %5, %3, #16 \n" \ - "add %6, %1, %5, lsl #2 \n" \ - "add %3, %3, %4 \n" \ - "vld1.32 {" #dn "[" #n "]}, [%6] \n" - -void ScaleARGBCols_NEON(uint8_t* dst_argb, - const uint8_t* src_argb, - int dst_width, - int x, - int dx) { - int tmp; - const uint8_t* src_tmp = src_argb; - asm volatile( - "1: \n" - // clang-format off - LOAD1_DATA32_LANE(d0, 0) - LOAD1_DATA32_LANE(d0, 1) - LOAD1_DATA32_LANE(d1, 0) - LOAD1_DATA32_LANE(d1, 1) - LOAD1_DATA32_LANE(d2, 0) - LOAD1_DATA32_LANE(d2, 1) - LOAD1_DATA32_LANE(d3, 0) - LOAD1_DATA32_LANE(d3, 1) - // clang-format on - "vst1.32 {q0, q1}, [%0]! 
\n" // store pixels - "subs %2, %2, #8 \n" // 8 processed per loop - "bgt 1b \n" - : "+r"(dst_argb), // %0 - "+r"(src_argb), // %1 - "+r"(dst_width), // %2 - "+r"(x), // %3 - "+r"(dx), // %4 - "=&r"(tmp), // %5 - "+r"(src_tmp) // %6 - : - : "memory", "cc", "q0", "q1"); -} - -#undef LOAD1_DATA32_LANE - -// TODO(Yang Zhang): Investigate less load instructions for -// the x/dx stepping -#define LOAD2_DATA32_LANE(dn1, dn2, n) \ - "lsr %5, %3, #16 \n" \ - "add %6, %1, %5, lsl #2 \n" \ - "add %3, %3, %4 \n" \ - "vld2.32 {" #dn1 "[" #n "], " #dn2 "[" #n "]}, [%6] \n" - -void ScaleARGBFilterCols_NEON(uint8_t* dst_argb, - const uint8_t* src_argb, - int dst_width, - int x, - int dx) { - int dx_offset[4] = {0, 1, 2, 3}; - int* tmp = dx_offset; - const uint8_t* src_tmp = src_argb; - asm volatile ( - "vdup.32 q0, %3 \n" // x - "vdup.32 q1, %4 \n" // dx - "vld1.32 {q2}, [%5] \n" // 0 1 2 3 - "vshl.i32 q9, q1, #2 \n" // 4 * dx - "vmul.s32 q1, q1, q2 \n" - "vmov.i8 q3, #0x7f \n" // 0x7F - "vmov.i16 q15, #0x7f \n" // 0x7F - // x , x + 1 * dx, x + 2 * dx, x + 3 * dx - "vadd.s32 q8, q1, q0 \n" - "1: \n" - // d0, d1: a - // d2, d3: b - LOAD2_DATA32_LANE(d0, d2, 0) - LOAD2_DATA32_LANE(d0, d2, 1) - LOAD2_DATA32_LANE(d1, d3, 0) - LOAD2_DATA32_LANE(d1, d3, 1) - "vshrn.i32 d22, q8, #9 \n" - "vand.16 d22, d22, d30 \n" - "vdup.8 d24, d22[0] \n" - "vdup.8 d25, d22[2] \n" - "vdup.8 d26, d22[4] \n" - "vdup.8 d27, d22[6] \n" - "vext.8 d4, d24, d25, #4 \n" - "vext.8 d5, d26, d27, #4 \n" // f - "veor.8 q10, q2, q3 \n" // 0x7f ^ f - "vmull.u8 q11, d0, d20 \n" - "vmull.u8 q12, d1, d21 \n" - "vmull.u8 q13, d2, d4 \n" - "vmull.u8 q14, d3, d5 \n" - "vadd.i16 q11, q11, q13 \n" - "vadd.i16 q12, q12, q14 \n" - "vshrn.i16 d0, q11, #7 \n" - "vshrn.i16 d1, q12, #7 \n" - - "vst1.32 {d0, d1}, [%0]! \n" // store pixels - "vadd.s32 q8, q8, q9 \n" - "subs %2, %2, #4 \n" // 4 processed per loop - "bgt 1b \n" - : "+r"(dst_argb), // %0 - "+r"(src_argb), // %1 - "+r"(dst_width), // %2 - "+r"(x), // %3 - "+r"(dx), // %4 - "+r"(tmp), // %5 - "+r"(src_tmp) // %6 - : - : "memory", "cc", "q0", "q1", "q2", "q3", "q8", "q9", - "q10", "q11", "q12", "q13", "q14", "q15" - ); -} - -#undef LOAD2_DATA32_LANE - -void ScaleUVRowDown2Box_NEON(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst, - int dst_width) { - asm volatile( - // change the stride to row 2 pointer - "add %1, %1, %0 \n" - "1: \n" - "vld2.8 {d0, d2}, [%0]! \n" // load 8 UV pixels. - "vld2.8 {d1, d3}, [%0]! \n" // load next 8 UV - "subs %3, %3, #8 \n" // 8 processed per loop. - "vpaddl.u8 q0, q0 \n" // U 16 bytes -> 8 shorts. - "vpaddl.u8 q1, q1 \n" // V 16 bytes -> 8 shorts. - "vld2.8 {d16, d18}, [%1]! \n" // load 8 more UV - "vld2.8 {d17, d19}, [%1]! \n" // load last 8 UV - "vpadal.u8 q0, q8 \n" // U 16 bytes -> 8 shorts. - "vpadal.u8 q1, q9 \n" // V 16 bytes -> 8 shorts. - "vrshrn.u16 d0, q0, #2 \n" // round and pack to bytes - "vrshrn.u16 d1, q1, #2 \n" - "vst2.8 {d0, d1}, [%2]! \n" - "bgt 1b \n" - : "+r"(src_ptr), // %0 - "+r"(src_stride), // %1 - "+r"(dst), // %2 - "+r"(dst_width) // %3 - : - : "memory", "cc", "q0", "q1", "q8", "q9"); -} - -// Reads 4 pixels at a time. 
-void ScaleUVRowDownEven_NEON(const uint8_t* src_ptr, - ptrdiff_t src_stride, - int src_stepx, // pixel step - uint8_t* dst_ptr, - int dst_width) { - const uint8_t* src1_ptr = src_ptr + src_stepx * 2; - const uint8_t* src2_ptr = src_ptr + src_stepx * 4; - const uint8_t* src3_ptr = src_ptr + src_stepx * 6; - (void)src_stride; - asm volatile( - "1: \n" - "vld1.16 {d0[0]}, [%0], %6 \n" - "vld1.16 {d0[1]}, [%1], %6 \n" - "vld1.16 {d0[2]}, [%2], %6 \n" - "vld1.16 {d0[3]}, [%3], %6 \n" - "subs %5, %5, #4 \n" // 4 pixels per loop. - "vst1.8 {d0}, [%4]! \n" - "bgt 1b \n" - : "+r"(src_ptr), // %0 - "+r"(src1_ptr), // %1 - "+r"(src2_ptr), // %2 - "+r"(src3_ptr), // %3 - "+r"(dst_ptr), // %4 - "+r"(dst_width) // %5 - : "r"(src_stepx * 8) // %6 - : "memory", "cc", "d0"); -} - -#endif // defined(__ARM_NEON__) && !defined(__aarch64__) - -#ifdef __cplusplus -} // extern "C" -} // namespace libyuv -#endif diff --git a/thirdparty/libyuv/source/scale_neon64.cc b/thirdparty/libyuv/source/scale_neon64.cc deleted file mode 100644 index 8656fec..0000000 --- a/thirdparty/libyuv/source/scale_neon64.cc +++ /dev/null @@ -1,1634 +0,0 @@ -/* - * Copyright 2014 The LibYuv Project Authors. All rights reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#include "libyuv/row.h" -#include "libyuv/scale.h" -#include "libyuv/scale_row.h" - -#ifdef __cplusplus -namespace libyuv { -extern "C" { -#endif - -// This module is for GCC Neon armv8 64 bit. -#if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__) - -// Read 32x1 throw away even pixels, and write 16x1. -void ScaleRowDown2_NEON(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst, - int dst_width) { - (void)src_stride; - asm volatile( - "1: \n" - // load even pixels into v0, odd into v1 - "ld2 {v0.16b,v1.16b}, [%0], #32 \n" - "subs %w2, %w2, #16 \n" // 16 processed per loop - "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead - "st1 {v1.16b}, [%1], #16 \n" // store odd pixels - "b.gt 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst), // %1 - "+r"(dst_width) // %2 - : - : "v0", "v1" // Clobber List - ); -} - -// Read 32x1 average down and write 16x1. -void ScaleRowDown2Linear_NEON(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst, - int dst_width) { - (void)src_stride; - asm volatile( - "1: \n" - // load even pixels into v0, odd into v1 - "ld2 {v0.16b,v1.16b}, [%0], #32 \n" - "subs %w2, %w2, #16 \n" // 16 processed per loop - "urhadd v0.16b, v0.16b, v1.16b \n" // rounding half add - "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead - "st1 {v0.16b}, [%1], #16 \n" - "b.gt 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst), // %1 - "+r"(dst_width) // %2 - : - : "v0", "v1" // Clobber List - ); -} - -// Read 32x2 average down and write 16x1. 
-void ScaleRowDown2Box_NEON(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst, - int dst_width) { - asm volatile( - // change the stride to row 2 pointer - "add %1, %1, %0 \n" - "1: \n" - "ld1 {v0.16b, v1.16b}, [%0], #32 \n" // load row 1 and post inc - "ld1 {v2.16b, v3.16b}, [%1], #32 \n" // load row 2 and post inc - "subs %w3, %w3, #16 \n" // 16 processed per loop - "uaddlp v0.8h, v0.16b \n" // row 1 add adjacent - "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead - "uaddlp v1.8h, v1.16b \n" - "prfm pldl1keep, [%1, 448] \n" - "uadalp v0.8h, v2.16b \n" // += row 2 add adjacent - "uadalp v1.8h, v3.16b \n" - "rshrn v0.8b, v0.8h, #2 \n" // round and pack - "rshrn2 v0.16b, v1.8h, #2 \n" - "st1 {v0.16b}, [%2], #16 \n" - "b.gt 1b \n" - : "+r"(src_ptr), // %0 - "+r"(src_stride), // %1 - "+r"(dst), // %2 - "+r"(dst_width) // %3 - : - : "v0", "v1", "v2", "v3" // Clobber List - ); -} - -void ScaleRowDown4_NEON(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width) { - (void)src_stride; - asm volatile( - "1: \n" - "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // src line 0 - "subs %w2, %w2, #8 \n" // 8 processed per loop - "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead - "st1 {v2.8b}, [%1], #8 \n" - "b.gt 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : - : "v0", "v1", "v2", "v3", "memory", "cc"); -} - -void ScaleRowDown4Box_NEON(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width) { - const uint8_t* src_ptr1 = src_ptr + src_stride; - const uint8_t* src_ptr2 = src_ptr + src_stride * 2; - const uint8_t* src_ptr3 = src_ptr + src_stride * 3; - asm volatile( - "1: \n" - "ld1 {v0.16b}, [%0], #16 \n" // load up 16x4 - "ld1 {v1.16b}, [%2], #16 \n" - "ld1 {v2.16b}, [%3], #16 \n" - "ld1 {v3.16b}, [%4], #16 \n" - "subs %w5, %w5, #4 \n" - "uaddlp v0.8h, v0.16b \n" - "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead - "uadalp v0.8h, v1.16b \n" - "prfm pldl1keep, [%2, 448] \n" - "uadalp v0.8h, v2.16b \n" - "prfm pldl1keep, [%3, 448] \n" - "uadalp v0.8h, v3.16b \n" - "prfm pldl1keep, [%4, 448] \n" - "addp v0.8h, v0.8h, v0.8h \n" - "rshrn v0.8b, v0.8h, #4 \n" // divide by 16 w/rounding - "st1 {v0.s}[0], [%1], #4 \n" - "b.gt 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(src_ptr1), // %2 - "+r"(src_ptr2), // %3 - "+r"(src_ptr3), // %4 - "+r"(dst_width) // %5 - : - : "v0", "v1", "v2", "v3", "memory", "cc"); -} - -// Down scale from 4 to 3 pixels. Use the neon multilane read/write -// to load up the every 4th pixel into a 4 different registers. -// Point samples 32 pixels to 24 pixels. 
-void ScaleRowDown34_NEON(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width) { - (void)src_stride; - asm volatile( - "1: \n" - "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // src line 0 - "subs %w2, %w2, #24 \n" - "orr v2.16b, v3.16b, v3.16b \n" // order v0,v1,v2 - "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead - "st3 {v0.8b,v1.8b,v2.8b}, [%1], #24 \n" - "b.gt 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : - : "v0", "v1", "v2", "v3", "memory", "cc"); -} - -void ScaleRowDown34_0_Box_NEON(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width) { - asm volatile( - "movi v20.8b, #3 \n" - "add %3, %3, %0 \n" - "1: \n" - "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // src line 0 - "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%3], #32 \n" // src line 1 - "subs %w2, %w2, #24 \n" - - // filter src line 0 with src line 1 - // expand chars to shorts to allow for room - // when adding lines together - "ushll v16.8h, v4.8b, #0 \n" - "ushll v17.8h, v5.8b, #0 \n" - "ushll v18.8h, v6.8b, #0 \n" - "ushll v19.8h, v7.8b, #0 \n" - - // 3 * line_0 + line_1 - "umlal v16.8h, v0.8b, v20.8b \n" - "umlal v17.8h, v1.8b, v20.8b \n" - "umlal v18.8h, v2.8b, v20.8b \n" - "umlal v19.8h, v3.8b, v20.8b \n" - "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead - - // (3 * line_0 + line_1 + 2) >> 2 - "uqrshrn v0.8b, v16.8h, #2 \n" - "uqrshrn v1.8b, v17.8h, #2 \n" - "uqrshrn v2.8b, v18.8h, #2 \n" - "uqrshrn v3.8b, v19.8h, #2 \n" - "prfm pldl1keep, [%3, 448] \n" - - // a0 = (src[0] * 3 + s[1] * 1 + 2) >> 2 - "ushll v16.8h, v1.8b, #0 \n" - "umlal v16.8h, v0.8b, v20.8b \n" - "uqrshrn v0.8b, v16.8h, #2 \n" - - // a1 = (src[1] * 1 + s[2] * 1 + 1) >> 1 - "urhadd v1.8b, v1.8b, v2.8b \n" - - // a2 = (src[2] * 1 + s[3] * 3 + 2) >> 2 - "ushll v16.8h, v2.8b, #0 \n" - "umlal v16.8h, v3.8b, v20.8b \n" - "uqrshrn v2.8b, v16.8h, #2 \n" - - "st3 {v0.8b,v1.8b,v2.8b}, [%1], #24 \n" - - "b.gt 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width), // %2 - "+r"(src_stride) // %3 - : - : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17", "v18", - "v19", "v20", "memory", "cc"); -} - -void ScaleRowDown34_1_Box_NEON(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width) { - asm volatile( - "movi v20.8b, #3 \n" - "add %3, %3, %0 \n" - "1: \n" - "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // src line 0 - "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%3], #32 \n" // src line 1 - "subs %w2, %w2, #24 \n" - // average src line 0 with src line 1 - "urhadd v0.8b, v0.8b, v4.8b \n" - "urhadd v1.8b, v1.8b, v5.8b \n" - "urhadd v2.8b, v2.8b, v6.8b \n" - "urhadd v3.8b, v3.8b, v7.8b \n" - "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead - - // a0 = (src[0] * 3 + s[1] * 1 + 2) >> 2 - "ushll v4.8h, v1.8b, #0 \n" - "umlal v4.8h, v0.8b, v20.8b \n" - "uqrshrn v0.8b, v4.8h, #2 \n" - "prfm pldl1keep, [%3, 448] \n" - - // a1 = (src[1] * 1 + s[2] * 1 + 1) >> 1 - "urhadd v1.8b, v1.8b, v2.8b \n" - - // a2 = (src[2] * 1 + s[3] * 3 + 2) >> 2 - "ushll v4.8h, v2.8b, #0 \n" - "umlal v4.8h, v3.8b, v20.8b \n" - "uqrshrn v2.8b, v4.8h, #2 \n" - - "st3 {v0.8b,v1.8b,v2.8b}, [%1], #24 \n" - "b.gt 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width), // %2 - "+r"(src_stride) // %3 - : - : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", "memory", "cc"); -} - -static const uvec8 kShuf38 = {0, 3, 6, 8, 11, 14, 16, 19, - 22, 24, 27, 30, 0, 0, 0, 0}; -static const uvec8 kShuf38_2 = {0, 16, 32, 2, 18, 33, 4, 
20, - 34, 6, 22, 35, 0, 0, 0, 0}; -static const vec16 kMult38_Div6 = {65536 / 12, 65536 / 12, 65536 / 12, - 65536 / 12, 65536 / 12, 65536 / 12, - 65536 / 12, 65536 / 12}; -static const vec16 kMult38_Div9 = {65536 / 18, 65536 / 18, 65536 / 18, - 65536 / 18, 65536 / 18, 65536 / 18, - 65536 / 18, 65536 / 18}; - -// 32 -> 12 -void ScaleRowDown38_NEON(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width) { - (void)src_stride; - asm volatile( - "ld1 {v3.16b}, [%3] \n" - "1: \n" - "ld1 {v0.16b,v1.16b}, [%0], #32 \n" - "subs %w2, %w2, #12 \n" - "tbl v2.16b, {v0.16b,v1.16b}, v3.16b \n" - "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead - "st1 {v2.8b}, [%1], #8 \n" - "st1 {v2.s}[2], [%1], #4 \n" - "b.gt 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : "r"(&kShuf38) // %3 - : "v0", "v1", "v2", "v3", "memory", "cc"); -} - -// 32x3 -> 12x1 -void OMITFP ScaleRowDown38_3_Box_NEON(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width) { - const uint8_t* src_ptr1 = src_ptr + src_stride * 2; - ptrdiff_t tmp_src_stride = src_stride; - - asm volatile( - "ld1 {v29.8h}, [%5] \n" - "ld1 {v30.16b}, [%6] \n" - "ld1 {v31.8h}, [%7] \n" - "add %2, %2, %0 \n" - "1: \n" - - // 00 40 01 41 02 42 03 43 - // 10 50 11 51 12 52 13 53 - // 20 60 21 61 22 62 23 63 - // 30 70 31 71 32 72 33 73 - "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" - "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%2], #32 \n" - "ld4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%3], #32 \n" - "subs %w4, %w4, #12 \n" - - // Shuffle the input data around to get align the data - // so adjacent data can be added. 0,1 - 2,3 - 4,5 - 6,7 - // 00 10 01 11 02 12 03 13 - // 40 50 41 51 42 52 43 53 - "trn1 v20.8b, v0.8b, v1.8b \n" - "trn2 v21.8b, v0.8b, v1.8b \n" - "trn1 v22.8b, v4.8b, v5.8b \n" - "trn2 v23.8b, v4.8b, v5.8b \n" - "trn1 v24.8b, v16.8b, v17.8b \n" - "trn2 v25.8b, v16.8b, v17.8b \n" - - // 20 30 21 31 22 32 23 33 - // 60 70 61 71 62 72 63 73 - "trn1 v0.8b, v2.8b, v3.8b \n" - "trn2 v1.8b, v2.8b, v3.8b \n" - "trn1 v4.8b, v6.8b, v7.8b \n" - "trn2 v5.8b, v6.8b, v7.8b \n" - "trn1 v16.8b, v18.8b, v19.8b \n" - "trn2 v17.8b, v18.8b, v19.8b \n" - - // 00+10 01+11 02+12 03+13 - // 40+50 41+51 42+52 43+53 - "uaddlp v20.4h, v20.8b \n" - "uaddlp v21.4h, v21.8b \n" - "uaddlp v22.4h, v22.8b \n" - "uaddlp v23.4h, v23.8b \n" - "uaddlp v24.4h, v24.8b \n" - "uaddlp v25.4h, v25.8b \n" - - // 60+70 61+71 62+72 63+73 - "uaddlp v1.4h, v1.8b \n" - "uaddlp v5.4h, v5.8b \n" - "uaddlp v17.4h, v17.8b \n" - - // combine source lines - "add v20.4h, v20.4h, v22.4h \n" - "add v21.4h, v21.4h, v23.4h \n" - "add v20.4h, v20.4h, v24.4h \n" - "add v21.4h, v21.4h, v25.4h \n" - "add v2.4h, v1.4h, v5.4h \n" - "add v2.4h, v2.4h, v17.4h \n" - - // dst_ptr[3] = (s[6 + st * 0] + s[7 + st * 0] - // + s[6 + st * 1] + s[7 + st * 1] - // + s[6 + st * 2] + s[7 + st * 2]) / 6 - "sqrdmulh v2.8h, v2.8h, v29.8h \n" - "xtn v2.8b, v2.8h \n" - - // Shuffle 2,3 reg around so that 2 can be added to the - // 0,1 reg and 3 can be added to the 4,5 reg. This - // requires expanding from u8 to u16 as the 0,1 and 4,5 - // registers are already expanded. Then do transposes - // to get aligned. 
- // xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33 - "ushll v16.8h, v16.8b, #0 \n" - "uaddl v0.8h, v0.8b, v4.8b \n" - - // combine source lines - "add v0.8h, v0.8h, v16.8h \n" - - // xx 20 xx 21 xx 22 xx 23 - // xx 30 xx 31 xx 32 xx 33 - "trn1 v1.8h, v0.8h, v0.8h \n" - "trn2 v4.8h, v0.8h, v0.8h \n" - "xtn v0.4h, v1.4s \n" - "xtn v4.4h, v4.4s \n" - "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead - - // 0+1+2, 3+4+5 - "add v20.8h, v20.8h, v0.8h \n" - "add v21.8h, v21.8h, v4.8h \n" - "prfm pldl1keep, [%2, 448] \n" - - // Need to divide, but can't downshift as the the value - // isn't a power of 2. So multiply by 65536 / n - // and take the upper 16 bits. - "sqrdmulh v0.8h, v20.8h, v31.8h \n" - "sqrdmulh v1.8h, v21.8h, v31.8h \n" - "prfm pldl1keep, [%3, 448] \n" - - // Align for table lookup, vtbl requires registers to be adjacent - "tbl v3.16b, {v0.16b, v1.16b, v2.16b}, v30.16b \n" - - "st1 {v3.8b}, [%1], #8 \n" - "st1 {v3.s}[2], [%1], #4 \n" - "b.gt 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(tmp_src_stride), // %2 - "+r"(src_ptr1), // %3 - "+r"(dst_width) // %4 - : "r"(&kMult38_Div6), // %5 - "r"(&kShuf38_2), // %6 - "r"(&kMult38_Div9) // %7 - : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17", "v18", - "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v29", "v30", "v31", - "memory", "cc"); -} - -// 32x2 -> 12x1 -void ScaleRowDown38_2_Box_NEON(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width) { - // TODO(fbarchard): use src_stride directly for clang 3.5+. - ptrdiff_t tmp_src_stride = src_stride; - asm volatile( - "ld1 {v30.8h}, [%4] \n" - "ld1 {v31.16b}, [%5] \n" - "add %2, %2, %0 \n" - "1: \n" - - // 00 40 01 41 02 42 03 43 - // 10 50 11 51 12 52 13 53 - // 20 60 21 61 22 62 23 63 - // 30 70 31 71 32 72 33 73 - "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" - "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%2], #32 \n" - "subs %w3, %w3, #12 \n" - - // Shuffle the input data around to get align the data - // so adjacent data can be added. 0,1 - 2,3 - 4,5 - 6,7 - // 00 10 01 11 02 12 03 13 - // 40 50 41 51 42 52 43 53 - "trn1 v16.8b, v0.8b, v1.8b \n" - "trn2 v17.8b, v0.8b, v1.8b \n" - "trn1 v18.8b, v4.8b, v5.8b \n" - "trn2 v19.8b, v4.8b, v5.8b \n" - - // 20 30 21 31 22 32 23 33 - // 60 70 61 71 62 72 63 73 - "trn1 v0.8b, v2.8b, v3.8b \n" - "trn2 v1.8b, v2.8b, v3.8b \n" - "trn1 v4.8b, v6.8b, v7.8b \n" - "trn2 v5.8b, v6.8b, v7.8b \n" - - // 00+10 01+11 02+12 03+13 - // 40+50 41+51 42+52 43+53 - "uaddlp v16.4h, v16.8b \n" - "uaddlp v17.4h, v17.8b \n" - "uaddlp v18.4h, v18.8b \n" - "uaddlp v19.4h, v19.8b \n" - - // 60+70 61+71 62+72 63+73 - "uaddlp v1.4h, v1.8b \n" - "uaddlp v5.4h, v5.8b \n" - - // combine source lines - "add v16.4h, v16.4h, v18.4h \n" - "add v17.4h, v17.4h, v19.4h \n" - "add v2.4h, v1.4h, v5.4h \n" - - // dst_ptr[3] = (s[6] + s[7] + s[6+st] + s[7+st]) / 4 - "uqrshrn v2.8b, v2.8h, #2 \n" - - // Shuffle 2,3 reg around so that 2 can be added to the - // 0,1 reg and 3 can be added to the 4,5 reg. This - // requires expanding from u8 to u16 as the 0,1 and 4,5 - // registers are already expanded. Then do transposes - // to get aligned. 
- // xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33 - - // combine source lines - "uaddl v0.8h, v0.8b, v4.8b \n" - - // xx 20 xx 21 xx 22 xx 23 - // xx 30 xx 31 xx 32 xx 33 - "trn1 v1.8h, v0.8h, v0.8h \n" - "trn2 v4.8h, v0.8h, v0.8h \n" - "xtn v0.4h, v1.4s \n" - "xtn v4.4h, v4.4s \n" - "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead - - // 0+1+2, 3+4+5 - "add v16.8h, v16.8h, v0.8h \n" - "add v17.8h, v17.8h, v4.8h \n" - "prfm pldl1keep, [%2, 448] \n" - - // Need to divide, but can't downshift as the the value - // isn't a power of 2. So multiply by 65536 / n - // and take the upper 16 bits. - "sqrdmulh v0.8h, v16.8h, v30.8h \n" - "sqrdmulh v1.8h, v17.8h, v30.8h \n" - - // Align for table lookup, vtbl requires registers to - // be adjacent - - "tbl v3.16b, {v0.16b, v1.16b, v2.16b}, v31.16b \n" - - "st1 {v3.8b}, [%1], #8 \n" - "st1 {v3.s}[2], [%1], #4 \n" - "b.gt 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(tmp_src_stride), // %2 - "+r"(dst_width) // %3 - : "r"(&kMult38_Div6), // %4 - "r"(&kShuf38_2) // %5 - : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17", "v18", - "v19", "v30", "v31", "memory", "cc"); -} - -void ScaleRowUp2_Linear_NEON(const uint8_t* src_ptr, - uint8_t* dst_ptr, - int dst_width) { - const uint8_t* src_temp = src_ptr + 1; - asm volatile( - "movi v31.8b, #3 \n" - - "1: \n" - "ldr d0, [%0], #8 \n" // 01234567 - "ldr d1, [%1], #8 \n" // 12345678 - "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead - - "ushll v2.8h, v0.8b, #0 \n" // 01234567 (16b) - "ushll v3.8h, v1.8b, #0 \n" // 12345678 (16b) - - "umlal v2.8h, v1.8b, v31.8b \n" // 3*near+far (odd) - "umlal v3.8h, v0.8b, v31.8b \n" // 3*near+far (even) - - "rshrn v2.8b, v2.8h, #2 \n" // 3/4*near+1/4*far (odd) - "rshrn v1.8b, v3.8h, #2 \n" // 3/4*near+1/4*far (even) - - "st2 {v1.8b, v2.8b}, [%2], #16 \n" // store - "subs %w3, %w3, #16 \n" // 8 sample -> 16 sample - "b.gt 1b \n" - : "+r"(src_ptr), // %0 - "+r"(src_temp), // %1 - "+r"(dst_ptr), // %2 - "+r"(dst_width) // %3 - : - : "memory", "cc", "v0", "v1", "v2", "v3", "v31" // Clobber List - ); -} - -void ScaleRowUp2_Bilinear_NEON(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - ptrdiff_t dst_stride, - int dst_width) { - const uint8_t* src_ptr1 = src_ptr + src_stride; - uint8_t* dst_ptr1 = dst_ptr + dst_stride; - const uint8_t* src_temp = src_ptr + 1; - const uint8_t* src_temp1 = src_ptr1 + 1; - - asm volatile( - "movi v31.8b, #3 \n" - "movi v30.8h, #3 \n" - - "1: \n" - "ldr d0, [%0], #8 \n" // 01234567 - "ldr d1, [%2], #8 \n" // 12345678 - "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead - - "ushll v2.8h, v0.8b, #0 \n" // 01234567 (16b) - "ushll v3.8h, v1.8b, #0 \n" // 12345678 (16b) - "umlal v2.8h, v1.8b, v31.8b \n" // 3*near+far (1, odd) - "umlal v3.8h, v0.8b, v31.8b \n" // 3*near+far (1, even) - - "ldr d0, [%1], #8 \n" - "ldr d1, [%3], #8 \n" - "prfm pldl1keep, [%1, 448] \n" // prefetch 7 lines ahead - - "ushll v4.8h, v0.8b, #0 \n" // 01234567 (16b) - "ushll v5.8h, v1.8b, #0 \n" // 12345678 (16b) - "umlal v4.8h, v1.8b, v31.8b \n" // 3*near+far (2, odd) - "umlal v5.8h, v0.8b, v31.8b \n" // 3*near+far (2, even) - - "mov v0.8h, v4.8h \n" - "mov v1.8h, v5.8h \n" - "mla v4.8h, v2.8h, v30.8h \n" // 9 3 3 1 (1, odd) - "mla v5.8h, v3.8h, v30.8h \n" // 9 3 3 1 (1, even) - "mla v2.8h, v0.8h, v30.8h \n" // 9 3 3 1 (2, odd) - "mla v3.8h, v1.8h, v30.8h \n" // 9 3 3 1 (2, even) - - "rshrn v2.8b, v2.8h, #4 \n" // 2, odd - "rshrn v1.8b, v3.8h, #4 \n" // 2, even - "rshrn v4.8b, v4.8h, #4 \n" // 1, odd - "rshrn 
v3.8b, v5.8h, #4 \n" // 1, even - - "st2 {v1.8b, v2.8b}, [%5], #16 \n" // store 1 - "st2 {v3.8b, v4.8b}, [%4], #16 \n" // store 2 - "subs %w6, %w6, #16 \n" // 8 sample -> 16 sample - "b.gt 1b \n" - : "+r"(src_ptr), // %0 - "+r"(src_ptr1), // %1 - "+r"(src_temp), // %2 - "+r"(src_temp1), // %3 - "+r"(dst_ptr), // %4 - "+r"(dst_ptr1), // %5 - "+r"(dst_width) // %6 - : - : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v30", - "v31" // Clobber List - ); -} - -void ScaleRowUp2_Linear_12_NEON(const uint16_t* src_ptr, - uint16_t* dst_ptr, - int dst_width) { - const uint16_t* src_temp = src_ptr + 1; - asm volatile( - "movi v31.8h, #3 \n" - - "1: \n" - "ld1 {v0.8h}, [%0], #16 \n" // 01234567 (16b) - "ld1 {v1.8h}, [%1], #16 \n" // 12345678 (16b) - "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead - - "mov v2.8h, v0.8h \n" - "mla v0.8h, v1.8h, v31.8h \n" // 3*near+far (odd) - "mla v1.8h, v2.8h, v31.8h \n" // 3*near+far (even) - - "urshr v2.8h, v0.8h, #2 \n" // 3/4*near+1/4*far (odd) - "urshr v1.8h, v1.8h, #2 \n" // 3/4*near+1/4*far (even) - - "st2 {v1.8h, v2.8h}, [%2], #32 \n" // store - "subs %w3, %w3, #16 \n" // 8 sample -> 16 sample - "b.gt 1b \n" - : "+r"(src_ptr), // %0 - "+r"(src_temp), // %1 - "+r"(dst_ptr), // %2 - "+r"(dst_width) // %3 - : - : "memory", "cc", "v0", "v1", "v2", "v31" // Clobber List - ); -} - -void ScaleRowUp2_Bilinear_12_NEON(const uint16_t* src_ptr, - ptrdiff_t src_stride, - uint16_t* dst_ptr, - ptrdiff_t dst_stride, - int dst_width) { - const uint16_t* src_ptr1 = src_ptr + src_stride; - uint16_t* dst_ptr1 = dst_ptr + dst_stride; - const uint16_t* src_temp = src_ptr + 1; - const uint16_t* src_temp1 = src_ptr1 + 1; - - asm volatile( - "movi v31.8h, #3 \n" - - "1: \n" - "ld1 {v2.8h}, [%0], #16 \n" // 01234567 (16b) - "ld1 {v3.8h}, [%2], #16 \n" // 12345678 (16b) - "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead - - "mov v0.8h, v2.8h \n" - "mla v2.8h, v3.8h, v31.8h \n" // 3*near+far (odd) - "mla v3.8h, v0.8h, v31.8h \n" // 3*near+far (even) - - "ld1 {v4.8h}, [%1], #16 \n" // 01234567 (16b) - "ld1 {v5.8h}, [%3], #16 \n" // 12345678 (16b) - "prfm pldl1keep, [%1, 448] \n" // prefetch 7 lines ahead - - "mov v0.8h, v4.8h \n" - "mla v4.8h, v5.8h, v31.8h \n" // 3*near+far (odd) - "mla v5.8h, v0.8h, v31.8h \n" // 3*near+far (even) - - "mov v0.8h, v4.8h \n" - "mov v1.8h, v5.8h \n" - "mla v4.8h, v2.8h, v31.8h \n" // 9 3 3 1 (1, odd) - "mla v5.8h, v3.8h, v31.8h \n" // 9 3 3 1 (1, even) - "mla v2.8h, v0.8h, v31.8h \n" // 9 3 3 1 (2, odd) - "mla v3.8h, v1.8h, v31.8h \n" // 9 3 3 1 (2, even) - - "urshr v2.8h, v2.8h, #4 \n" // 2, odd - "urshr v1.8h, v3.8h, #4 \n" // 2, even - "urshr v4.8h, v4.8h, #4 \n" // 1, odd - "urshr v3.8h, v5.8h, #4 \n" // 1, even - - "st2 {v3.8h, v4.8h}, [%4], #32 \n" // store 1 - "st2 {v1.8h, v2.8h}, [%5], #32 \n" // store 2 - - "subs %w6, %w6, #16 \n" // 8 sample -> 16 sample - "b.gt 1b \n" - : "+r"(src_ptr), // %0 - "+r"(src_ptr1), // %1 - "+r"(src_temp), // %2 - "+r"(src_temp1), // %3 - "+r"(dst_ptr), // %4 - "+r"(dst_ptr1), // %5 - "+r"(dst_width) // %6 - : - : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", - "v31" // Clobber List - ); -} - -void ScaleRowUp2_Linear_16_NEON(const uint16_t* src_ptr, - uint16_t* dst_ptr, - int dst_width) { - const uint16_t* src_temp = src_ptr + 1; - asm volatile( - "movi v31.8h, #3 \n" - - "1: \n" - "ld1 {v0.8h}, [%0], #16 \n" // 01234567 (16b) - "ld1 {v1.8h}, [%1], #16 \n" // 12345678 (16b) - "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead - - "ushll v2.4s, v0.4h, #0 \n" // 0123 (32b) - "ushll2 
v3.4s, v0.8h, #0 \n" // 4567 (32b) - "ushll v4.4s, v1.4h, #0 \n" // 1234 (32b) - "ushll2 v5.4s, v1.8h, #0 \n" // 5678 (32b) - - "umlal v2.4s, v1.4h, v31.4h \n" // 3*near+far (1, odd) - "umlal2 v3.4s, v1.8h, v31.8h \n" // 3*near+far (2, odd) - "umlal v4.4s, v0.4h, v31.4h \n" // 3*near+far (1, even) - "umlal2 v5.4s, v0.8h, v31.8h \n" // 3*near+far (2, even) - - "rshrn v0.4h, v4.4s, #2 \n" // 3/4*near+1/4*far - "rshrn2 v0.8h, v5.4s, #2 \n" // 3/4*near+1/4*far (even) - "rshrn v1.4h, v2.4s, #2 \n" // 3/4*near+1/4*far - "rshrn2 v1.8h, v3.4s, #2 \n" // 3/4*near+1/4*far (odd) - - "st2 {v0.8h, v1.8h}, [%2], #32 \n" // store - "subs %w3, %w3, #16 \n" // 8 sample -> 16 sample - "b.gt 1b \n" - : "+r"(src_ptr), // %0 - "+r"(src_temp), // %1 - "+r"(dst_ptr), // %2 - "+r"(dst_width) // %3 - : - : "memory", "cc", "v0", "v1", "v2", "v31" // Clobber List - ); -} - -void ScaleRowUp2_Bilinear_16_NEON(const uint16_t* src_ptr, - ptrdiff_t src_stride, - uint16_t* dst_ptr, - ptrdiff_t dst_stride, - int dst_width) { - const uint16_t* src_ptr1 = src_ptr + src_stride; - uint16_t* dst_ptr1 = dst_ptr + dst_stride; - const uint16_t* src_temp = src_ptr + 1; - const uint16_t* src_temp1 = src_ptr1 + 1; - - asm volatile( - "movi v31.4h, #3 \n" - "movi v30.4s, #3 \n" - - "1: \n" - "ldr d0, [%0], #8 \n" // 0123 (16b) - "ldr d1, [%2], #8 \n" // 1234 (16b) - "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead - "ushll v2.4s, v0.4h, #0 \n" // 0123 (32b) - "ushll v3.4s, v1.4h, #0 \n" // 1234 (32b) - "umlal v2.4s, v1.4h, v31.4h \n" // 3*near+far (1, odd) - "umlal v3.4s, v0.4h, v31.4h \n" // 3*near+far (1, even) - - "ldr d0, [%1], #8 \n" // 0123 (16b) - "ldr d1, [%3], #8 \n" // 1234 (16b) - "prfm pldl1keep, [%1, 448] \n" // prefetch 7 lines ahead - "ushll v4.4s, v0.4h, #0 \n" // 0123 (32b) - "ushll v5.4s, v1.4h, #0 \n" // 1234 (32b) - "umlal v4.4s, v1.4h, v31.4h \n" // 3*near+far (2, odd) - "umlal v5.4s, v0.4h, v31.4h \n" // 3*near+far (2, even) - - "mov v0.16b, v4.16b \n" - "mov v1.16b, v5.16b \n" - "mla v4.4s, v2.4s, v30.4s \n" // 9 3 3 1 (1, odd) - "mla v5.4s, v3.4s, v30.4s \n" // 9 3 3 1 (1, even) - "mla v2.4s, v0.4s, v30.4s \n" // 9 3 3 1 (2, odd) - "mla v3.4s, v1.4s, v30.4s \n" // 9 3 3 1 (2, even) - - "rshrn v1.4h, v4.4s, #4 \n" // 3/4*near+1/4*far - "rshrn v0.4h, v5.4s, #4 \n" // 3/4*near+1/4*far - "rshrn v5.4h, v2.4s, #4 \n" // 3/4*near+1/4*far - "rshrn v4.4h, v3.4s, #4 \n" // 3/4*near+1/4*far - - "st2 {v0.4h, v1.4h}, [%4], #16 \n" // store 1 - "st2 {v4.4h, v5.4h}, [%5], #16 \n" // store 2 - - "subs %w6, %w6, #8 \n" // 4 sample -> 8 sample - "b.gt 1b \n" - : "+r"(src_ptr), // %0 - "+r"(src_ptr1), // %1 - "+r"(src_temp), // %2 - "+r"(src_temp1), // %3 - "+r"(dst_ptr), // %4 - "+r"(dst_ptr1), // %5 - "+r"(dst_width) // %6 - : - : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v30", - "v31" // Clobber List - ); -} - -void ScaleUVRowUp2_Linear_NEON(const uint8_t* src_ptr, - uint8_t* dst_ptr, - int dst_width) { - const uint8_t* src_temp = src_ptr + 2; - asm volatile( - "movi v31.8b, #3 \n" - - "1: \n" - "ldr d0, [%0], #8 \n" // 00112233 (1u1v) - "ldr d1, [%1], #8 \n" // 11223344 (1u1v) - "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead - - "ushll v2.8h, v0.8b, #0 \n" // 00112233 (1u1v, 16b) - "ushll v3.8h, v1.8b, #0 \n" // 11223344 (1u1v, 16b) - - "umlal v2.8h, v1.8b, v31.8b \n" // 3*near+far (odd) - "umlal v3.8h, v0.8b, v31.8b \n" // 3*near+far (even) - - "rshrn v2.8b, v2.8h, #2 \n" // 3/4*near+1/4*far (odd) - "rshrn v1.8b, v3.8h, #2 \n" // 3/4*near+1/4*far (even) - - "st2 {v1.4h, v2.4h}, [%2], #16 \n" // 
store - "subs %w3, %w3, #8 \n" // 4 uv -> 8 uv - "b.gt 1b \n" - : "+r"(src_ptr), // %0 - "+r"(src_temp), // %1 - "+r"(dst_ptr), // %2 - "+r"(dst_width) // %3 - : - : "memory", "cc", "v0", "v1", "v2", "v3", "v31" // Clobber List - ); -} - -void ScaleUVRowUp2_Bilinear_NEON(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - ptrdiff_t dst_stride, - int dst_width) { - const uint8_t* src_ptr1 = src_ptr + src_stride; - uint8_t* dst_ptr1 = dst_ptr + dst_stride; - const uint8_t* src_temp = src_ptr + 2; - const uint8_t* src_temp1 = src_ptr1 + 2; - - asm volatile( - "movi v31.8b, #3 \n" - "movi v30.8h, #3 \n" - - "1: \n" - "ldr d0, [%0], #8 \n" - "ldr d1, [%2], #8 \n" - "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead - - "ushll v2.8h, v0.8b, #0 \n" - "ushll v3.8h, v1.8b, #0 \n" - "umlal v2.8h, v1.8b, v31.8b \n" // 3*near+far (1, odd) - "umlal v3.8h, v0.8b, v31.8b \n" // 3*near+far (1, even) - - "ldr d0, [%1], #8 \n" - "ldr d1, [%3], #8 \n" - "prfm pldl1keep, [%1, 448] \n" // prefetch 7 lines ahead - - "ushll v4.8h, v0.8b, #0 \n" - "ushll v5.8h, v1.8b, #0 \n" - "umlal v4.8h, v1.8b, v31.8b \n" // 3*near+far (2, odd) - "umlal v5.8h, v0.8b, v31.8b \n" // 3*near+far (2, even) - - "mov v0.8h, v4.8h \n" - "mov v1.8h, v5.8h \n" - "mla v4.8h, v2.8h, v30.8h \n" // 9 3 3 1 (1, odd) - "mla v5.8h, v3.8h, v30.8h \n" // 9 3 3 1 (1, even) - "mla v2.8h, v0.8h, v30.8h \n" // 9 3 3 1 (2, odd) - "mla v3.8h, v1.8h, v30.8h \n" // 9 3 3 1 (2, even) - - "rshrn v2.8b, v2.8h, #4 \n" // 2, odd - "rshrn v1.8b, v3.8h, #4 \n" // 2, even - "rshrn v4.8b, v4.8h, #4 \n" // 1, odd - "rshrn v3.8b, v5.8h, #4 \n" // 1, even - - "st2 {v1.4h, v2.4h}, [%5], #16 \n" // store 2 - "st2 {v3.4h, v4.4h}, [%4], #16 \n" // store 1 - "subs %w6, %w6, #8 \n" // 4 uv -> 8 uv - "b.gt 1b \n" - : "+r"(src_ptr), // %0 - "+r"(src_ptr1), // %1 - "+r"(src_temp), // %2 - "+r"(src_temp1), // %3 - "+r"(dst_ptr), // %4 - "+r"(dst_ptr1), // %5 - "+r"(dst_width) // %6 - : - : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v30", - "v31" // Clobber List - ); -} - -void ScaleUVRowUp2_Linear_16_NEON(const uint16_t* src_ptr, - uint16_t* dst_ptr, - int dst_width) { - const uint16_t* src_temp = src_ptr + 2; - asm volatile( - "movi v31.8h, #3 \n" - - "1: \n" - "ld1 {v0.8h}, [%0], #16 \n" // 01234567 (16b) - "ld1 {v1.8h}, [%1], #16 \n" // 12345678 (16b) - "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead - - "ushll v2.4s, v0.4h, #0 \n" // 0011 (1u1v, 32b) - "ushll v3.4s, v1.4h, #0 \n" // 1122 (1u1v, 32b) - "ushll2 v4.4s, v0.8h, #0 \n" // 2233 (1u1v, 32b) - "ushll2 v5.4s, v1.8h, #0 \n" // 3344 (1u1v, 32b) - - "umlal v2.4s, v1.4h, v31.4h \n" // 3*near+far (odd) - "umlal v3.4s, v0.4h, v31.4h \n" // 3*near+far (even) - "umlal2 v4.4s, v1.8h, v31.8h \n" // 3*near+far (odd) - "umlal2 v5.4s, v0.8h, v31.8h \n" // 3*near+far (even) - - "rshrn v2.4h, v2.4s, #2 \n" // 3/4*near+1/4*far (odd) - "rshrn v1.4h, v3.4s, #2 \n" // 3/4*near+1/4*far (even) - "rshrn v4.4h, v4.4s, #2 \n" // 3/4*near+1/4*far (odd) - "rshrn v3.4h, v5.4s, #2 \n" // 3/4*near+1/4*far (even) - - "st2 {v1.2s, v2.2s}, [%2], #16 \n" // store - "st2 {v3.2s, v4.2s}, [%2], #16 \n" // store - "subs %w3, %w3, #8 \n" // 4 uv -> 8 uv - "b.gt 1b \n" - : "+r"(src_ptr), // %0 - "+r"(src_temp), // %1 - "+r"(dst_ptr), // %2 - "+r"(dst_width) // %3 - : - : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", - "v31" // Clobber List - ); -} - -void ScaleUVRowUp2_Bilinear_16_NEON(const uint16_t* src_ptr, - ptrdiff_t src_stride, - uint16_t* dst_ptr, - ptrdiff_t dst_stride, - int dst_width) { - const 
uint16_t* src_ptr1 = src_ptr + src_stride; - uint16_t* dst_ptr1 = dst_ptr + dst_stride; - const uint16_t* src_temp = src_ptr + 2; - const uint16_t* src_temp1 = src_ptr1 + 2; - - asm volatile( - "movi v31.4h, #3 \n" - "movi v30.4s, #3 \n" - - "1: \n" - "ldr d0, [%0], #8 \n" - "ldr d1, [%2], #8 \n" - "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead - "ushll v2.4s, v0.4h, #0 \n" // 0011 (1u1v, 32b) - "ushll v3.4s, v1.4h, #0 \n" // 1122 (1u1v, 32b) - "umlal v2.4s, v1.4h, v31.4h \n" // 3*near+far (1, odd) - "umlal v3.4s, v0.4h, v31.4h \n" // 3*near+far (1, even) - - "ldr d0, [%1], #8 \n" - "ldr d1, [%3], #8 \n" - "prfm pldl1keep, [%1, 448] \n" // prefetch 7 lines ahead - "ushll v4.4s, v0.4h, #0 \n" // 0011 (1u1v, 32b) - "ushll v5.4s, v1.4h, #0 \n" // 1122 (1u1v, 32b) - "umlal v4.4s, v1.4h, v31.4h \n" // 3*near+far (2, odd) - "umlal v5.4s, v0.4h, v31.4h \n" // 3*near+far (2, even) - - "mov v0.16b, v4.16b \n" - "mov v1.16b, v5.16b \n" - "mla v4.4s, v2.4s, v30.4s \n" // 9 3 3 1 (1, odd) - "mla v5.4s, v3.4s, v30.4s \n" // 9 3 3 1 (1, even) - "mla v2.4s, v0.4s, v30.4s \n" // 9 3 3 1 (2, odd) - "mla v3.4s, v1.4s, v30.4s \n" // 9 3 3 1 (2, even) - - "rshrn v1.4h, v2.4s, #4 \n" // 2, odd - "rshrn v0.4h, v3.4s, #4 \n" // 2, even - "rshrn v3.4h, v4.4s, #4 \n" // 1, odd - "rshrn v2.4h, v5.4s, #4 \n" // 1, even - - "st2 {v0.2s, v1.2s}, [%5], #16 \n" // store 2 - "st2 {v2.2s, v3.2s}, [%4], #16 \n" // store 1 - "subs %w6, %w6, #4 \n" // 2 uv -> 4 uv - "b.gt 1b \n" - : "+r"(src_ptr), // %0 - "+r"(src_ptr1), // %1 - "+r"(src_temp), // %2 - "+r"(src_temp1), // %3 - "+r"(dst_ptr), // %4 - "+r"(dst_ptr1), // %5 - "+r"(dst_width) // %6 - : - : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v30", - "v31" // Clobber List - ); -} - -// Add a row of bytes to a row of shorts. Used for box filter. -// Reads 16 bytes and accumulates to 16 shorts at a time. 
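The "9 3 3 1" comments in the 2x bilinear up-samplers above come from applying the (3 * near + far) / 4 filter in both directions, so one output sample is weighted 9/16, 3/16, 3/16, 1/16 of its 2x2 source neighbourhood. A scalar sketch of that weighting (illustrative helper only, not the NEON code):

#include <stdint.h>

// Sketch: one bilinear 2x-upsample output from its 2x2 source neighbourhood.
static inline uint8_t Bilinear9331Sketch(uint8_t nearest, uint8_t far_h,
                                         uint8_t far_v, uint8_t far_d) {
  // (3*near + far)/4 horizontally then vertically expands to 9/3/3/1;
  // +8 rounds before the final >>4, matching the rounding shifts above.
  return (uint8_t)((9 * nearest + 3 * far_h + 3 * far_v + far_d + 8) >> 4);
}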
-void ScaleAddRow_NEON(const uint8_t* src_ptr, - uint16_t* dst_ptr, - int src_width) { - asm volatile( - "1: \n" - "ld1 {v1.8h, v2.8h}, [%1] \n" // load accumulator - "ld1 {v0.16b}, [%0], #16 \n" // load 16 bytes - "uaddw2 v2.8h, v2.8h, v0.16b \n" // add - "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead - "uaddw v1.8h, v1.8h, v0.8b \n" - "st1 {v1.8h, v2.8h}, [%1], #32 \n" // store accumulator - "subs %w2, %w2, #16 \n" // 16 processed per loop - "b.gt 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(src_width) // %2 - : - : "memory", "cc", "v0", "v1", "v2" // Clobber List - ); -} - -// TODO(Yang Zhang): Investigate less load instructions for -// the x/dx stepping -#define LOAD2_DATA8_LANE(n) \ - "lsr %5, %3, #16 \n" \ - "add %6, %1, %5 \n" \ - "add %3, %3, %4 \n" \ - "ld2 {v4.b, v5.b}[" #n "], [%6] \n" - -// The NEON version mimics this formula (from row_common.cc): -// #define BLENDER(a, b, f) (uint8_t)((int)(a) + -// ((((int)((f)) * ((int)(b) - (int)(a))) + 0x8000) >> 16)) - -void ScaleFilterCols_NEON(uint8_t* dst_ptr, - const uint8_t* src_ptr, - int dst_width, - int x, - int dx) { - int dx_offset[4] = {0, 1, 2, 3}; - int* tmp = dx_offset; - const uint8_t* src_tmp = src_ptr; - int64_t x64 = (int64_t)x; // NOLINT - int64_t dx64 = (int64_t)dx; // NOLINT - asm volatile ( - "dup v0.4s, %w3 \n" // x - "dup v1.4s, %w4 \n" // dx - "ld1 {v2.4s}, [%5] \n" // 0 1 2 3 - "shl v3.4s, v1.4s, #2 \n" // 4 * dx - "mul v1.4s, v1.4s, v2.4s \n" - // x , x + 1 * dx, x + 2 * dx, x + 3 * dx - "add v1.4s, v1.4s, v0.4s \n" - // x + 4 * dx, x + 5 * dx, x + 6 * dx, x + 7 * dx - "add v2.4s, v1.4s, v3.4s \n" - "shl v0.4s, v3.4s, #1 \n" // 8 * dx - "1: \n" - LOAD2_DATA8_LANE(0) - LOAD2_DATA8_LANE(1) - LOAD2_DATA8_LANE(2) - LOAD2_DATA8_LANE(3) - LOAD2_DATA8_LANE(4) - LOAD2_DATA8_LANE(5) - LOAD2_DATA8_LANE(6) - LOAD2_DATA8_LANE(7) - "mov v6.16b, v1.16b \n" - "mov v7.16b, v2.16b \n" - "uzp1 v6.8h, v6.8h, v7.8h \n" - "ushll v4.8h, v4.8b, #0 \n" - "ushll v5.8h, v5.8b, #0 \n" - "ssubl v16.4s, v5.4h, v4.4h \n" - "ssubl2 v17.4s, v5.8h, v4.8h \n" - "ushll v7.4s, v6.4h, #0 \n" - "ushll2 v6.4s, v6.8h, #0 \n" - "mul v16.4s, v16.4s, v7.4s \n" - "mul v17.4s, v17.4s, v6.4s \n" - "rshrn v6.4h, v16.4s, #16 \n" - "rshrn2 v6.8h, v17.4s, #16 \n" - "add v4.8h, v4.8h, v6.8h \n" - "xtn v4.8b, v4.8h \n" - - "st1 {v4.8b}, [%0], #8 \n" // store pixels - "add v1.4s, v1.4s, v0.4s \n" - "add v2.4s, v2.4s, v0.4s \n" - "subs %w2, %w2, #8 \n" // 8 processed per loop - "b.gt 1b \n" - : "+r"(dst_ptr), // %0 - "+r"(src_ptr), // %1 - "+r"(dst_width), // %2 - "+r"(x64), // %3 - "+r"(dx64), // %4 - "+r"(tmp), // %5 - "+r"(src_tmp) // %6 - : - : "memory", "cc", "v0", "v1", "v2", "v3", - "v4", "v5", "v6", "v7", "v16", "v17" - ); -} - -#undef LOAD2_DATA8_LANE - -// 16x2 -> 16x1 -void ScaleFilterRows_NEON(uint8_t* dst_ptr, - const uint8_t* src_ptr, - ptrdiff_t src_stride, - int dst_width, - int source_y_fraction) { - int y_fraction = 256 - source_y_fraction; - asm volatile( - "cmp %w4, #0 \n" - "b.eq 100f \n" - "add %2, %2, %1 \n" - "cmp %w4, #64 \n" - "b.eq 75f \n" - "cmp %w4, #128 \n" - "b.eq 50f \n" - "cmp %w4, #192 \n" - "b.eq 25f \n" - - "dup v5.8b, %w4 \n" - "dup v4.8b, %w5 \n" - // General purpose row blend. 
- "1: \n" - "ld1 {v0.16b}, [%1], #16 \n" - "ld1 {v1.16b}, [%2], #16 \n" - "subs %w3, %w3, #16 \n" - "umull v6.8h, v0.8b, v4.8b \n" - "umull2 v7.8h, v0.16b, v4.16b \n" - "prfm pldl1keep, [%1, 448] \n" // prefetch 7 lines ahead - "umlal v6.8h, v1.8b, v5.8b \n" - "umlal2 v7.8h, v1.16b, v5.16b \n" - "prfm pldl1keep, [%2, 448] \n" - "rshrn v0.8b, v6.8h, #8 \n" - "rshrn2 v0.16b, v7.8h, #8 \n" - "st1 {v0.16b}, [%0], #16 \n" - "b.gt 1b \n" - "b 99f \n" - - // Blend 25 / 75. - "25: \n" - "ld1 {v0.16b}, [%1], #16 \n" - "ld1 {v1.16b}, [%2], #16 \n" - "subs %w3, %w3, #16 \n" - "urhadd v0.16b, v0.16b, v1.16b \n" - "prfm pldl1keep, [%1, 448] \n" // prefetch 7 lines ahead - "urhadd v0.16b, v0.16b, v1.16b \n" - "prfm pldl1keep, [%2, 448] \n" - "st1 {v0.16b}, [%0], #16 \n" - "b.gt 25b \n" - "b 99f \n" - - // Blend 50 / 50. - "50: \n" - "ld1 {v0.16b}, [%1], #16 \n" - "ld1 {v1.16b}, [%2], #16 \n" - "subs %w3, %w3, #16 \n" - "prfm pldl1keep, [%1, 448] \n" // prefetch 7 lines ahead - "urhadd v0.16b, v0.16b, v1.16b \n" - "prfm pldl1keep, [%2, 448] \n" - "st1 {v0.16b}, [%0], #16 \n" - "b.gt 50b \n" - "b 99f \n" - - // Blend 75 / 25. - "75: \n" - "ld1 {v1.16b}, [%1], #16 \n" - "ld1 {v0.16b}, [%2], #16 \n" - "subs %w3, %w3, #16 \n" - "urhadd v0.16b, v0.16b, v1.16b \n" - "prfm pldl1keep, [%1, 448] \n" // prefetch 7 lines ahead - "urhadd v0.16b, v0.16b, v1.16b \n" - "prfm pldl1keep, [%2, 448] \n" - "st1 {v0.16b}, [%0], #16 \n" - "b.gt 75b \n" - "b 99f \n" - - // Blend 100 / 0 - Copy row unchanged. - "100: \n" - "ld1 {v0.16b}, [%1], #16 \n" - "subs %w3, %w3, #16 \n" - "prfm pldl1keep, [%1, 448] \n" // prefetch 7 lines ahead - "st1 {v0.16b}, [%0], #16 \n" - "b.gt 100b \n" - - "99: \n" - "st1 {v0.b}[15], [%0] \n" - : "+r"(dst_ptr), // %0 - "+r"(src_ptr), // %1 - "+r"(src_stride), // %2 - "+r"(dst_width), // %3 - "+r"(source_y_fraction), // %4 - "+r"(y_fraction) // %5 - : - : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "memory", "cc"); -} - -void ScaleARGBRowDown2_NEON(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst, - int dst_width) { - (void)src_stride; - asm volatile( - "1: \n" - // load 16 ARGB pixels with even pixels into q0/q2, odd into q1/q3 - "ld4 {v0.4s,v1.4s,v2.4s,v3.4s}, [%0], #64 \n" - "subs %w2, %w2, #8 \n" // 8 processed per loop - "mov v2.16b, v3.16b \n" - "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead - "st2 {v1.4s,v2.4s}, [%1], #32 \n" // store 8 odd pixels - "b.gt 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst), // %1 - "+r"(dst_width) // %2 - : - : "memory", "cc", "v0", "v1", "v2", "v3" // Clobber List - ); -} - -void ScaleARGBRowDown2Linear_NEON(const uint8_t* src_argb, - ptrdiff_t src_stride, - uint8_t* dst_argb, - int dst_width) { - (void)src_stride; - asm volatile( - "1: \n" - // load 16 ARGB pixels with even pixels into q0/q2, odd into q1/q3 - "ld4 {v0.4s,v1.4s,v2.4s,v3.4s}, [%0], #64 \n" - "subs %w2, %w2, #8 \n" // 8 processed per loop - - "urhadd v0.16b, v0.16b, v1.16b \n" // rounding half add - "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead - "urhadd v1.16b, v2.16b, v3.16b \n" - "st2 {v0.4s,v1.4s}, [%1], #32 \n" // store 8 pixels - "b.gt 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_argb), // %1 - "+r"(dst_width) // %2 - : - : "memory", "cc", "v0", "v1", "v2", "v3" // Clobber List - ); -} - -void ScaleARGBRowDown2Box_NEON(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst, - int dst_width) { - asm volatile( - // change the stride to row 2 pointer - "add %1, %1, %0 \n" - "1: \n" - "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 ARGB - 
"subs %w3, %w3, #8 \n" // 8 processed per loop. - "uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts. - "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts. - "uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts. - "uaddlp v3.8h, v3.16b \n" // A 16 bytes -> 8 shorts. - "ld4 {v16.16b,v17.16b,v18.16b,v19.16b}, [%1], #64 \n" // load 8 - "uadalp v0.8h, v16.16b \n" // B 16 bytes -> 8 shorts. - "uadalp v1.8h, v17.16b \n" // G 16 bytes -> 8 shorts. - "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead - "uadalp v2.8h, v18.16b \n" // R 16 bytes -> 8 shorts. - "uadalp v3.8h, v19.16b \n" // A 16 bytes -> 8 shorts. - "prfm pldl1keep, [%1, 448] \n" - "rshrn v0.8b, v0.8h, #2 \n" // round and pack - "rshrn v1.8b, v1.8h, #2 \n" - "rshrn v2.8b, v2.8h, #2 \n" - "rshrn v3.8b, v3.8h, #2 \n" - "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" - "b.gt 1b \n" - : "+r"(src_ptr), // %0 - "+r"(src_stride), // %1 - "+r"(dst), // %2 - "+r"(dst_width) // %3 - : - : "memory", "cc", "v0", "v1", "v2", "v3", "v16", "v17", "v18", "v19"); -} - -// Reads 4 pixels at a time. -// Alignment requirement: src_argb 4 byte aligned. -void ScaleARGBRowDownEven_NEON(const uint8_t* src_argb, - ptrdiff_t src_stride, - int src_stepx, - uint8_t* dst_argb, - int dst_width) { - (void)src_stride; - asm volatile( - "1: \n" - "ld1 {v0.s}[0], [%0], %3 \n" - "ld1 {v0.s}[1], [%0], %3 \n" - "ld1 {v0.s}[2], [%0], %3 \n" - "ld1 {v0.s}[3], [%0], %3 \n" - "subs %w2, %w2, #4 \n" // 4 pixels per loop. - "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead - "st1 {v0.16b}, [%1], #16 \n" - "b.gt 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_argb), // %1 - "+r"(dst_width) // %2 - : "r"((int64_t)(src_stepx * 4)) // %3 - : "memory", "cc", "v0"); -} - -// Reads 4 pixels at a time. -// Alignment requirement: src_argb 4 byte aligned. -// TODO(Yang Zhang): Might be worth another optimization pass in future. -// It could be upgraded to 8 pixels at a time to start with. -void ScaleARGBRowDownEvenBox_NEON(const uint8_t* src_argb, - ptrdiff_t src_stride, - int src_stepx, - uint8_t* dst_argb, - int dst_width) { - asm volatile( - "add %1, %1, %0 \n" - "1: \n" - "ld1 {v0.8b}, [%0], %4 \n" // Read 4 2x2 -> 2x1 - "ld1 {v1.8b}, [%1], %4 \n" - "ld1 {v2.8b}, [%0], %4 \n" - "ld1 {v3.8b}, [%1], %4 \n" - "ld1 {v4.8b}, [%0], %4 \n" - "ld1 {v5.8b}, [%1], %4 \n" - "ld1 {v6.8b}, [%0], %4 \n" - "ld1 {v7.8b}, [%1], %4 \n" - "uaddl v0.8h, v0.8b, v1.8b \n" - "uaddl v2.8h, v2.8b, v3.8b \n" - "uaddl v4.8h, v4.8b, v5.8b \n" - "uaddl v6.8h, v6.8b, v7.8b \n" - "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead - "mov v16.d[1], v0.d[1] \n" // ab_cd -> ac_bd - "mov v0.d[1], v2.d[0] \n" - "mov v2.d[0], v16.d[1] \n" - "mov v16.d[1], v4.d[1] \n" // ef_gh -> eg_fh - "mov v4.d[1], v6.d[0] \n" - "mov v6.d[0], v16.d[1] \n" - "prfm pldl1keep, [%1, 448] \n" - "add v0.8h, v0.8h, v2.8h \n" // (a+b)_(c+d) - "add v4.8h, v4.8h, v6.8h \n" // (e+f)_(g+h) - "rshrn v0.8b, v0.8h, #2 \n" // first 2 pixels. - "rshrn2 v0.16b, v4.8h, #2 \n" // next 2 pixels. - "subs %w3, %w3, #4 \n" // 4 pixels per loop. 
- "st1 {v0.16b}, [%2], #16 \n" - "b.gt 1b \n" - : "+r"(src_argb), // %0 - "+r"(src_stride), // %1 - "+r"(dst_argb), // %2 - "+r"(dst_width) // %3 - : "r"((int64_t)(src_stepx * 4)) // %4 - : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16"); -} - -// TODO(Yang Zhang): Investigate less load instructions for -// the x/dx stepping -#define LOAD1_DATA32_LANE(vn, n) \ - "lsr %5, %3, #16 \n" \ - "add %6, %1, %5, lsl #2 \n" \ - "add %3, %3, %4 \n" \ - "ld1 {" #vn ".s}[" #n "], [%6] \n" - -void ScaleARGBCols_NEON(uint8_t* dst_argb, - const uint8_t* src_argb, - int dst_width, - int x, - int dx) { - const uint8_t* src_tmp = src_argb; - int64_t x64 = (int64_t)x; // NOLINT - int64_t dx64 = (int64_t)dx; // NOLINT - int64_t tmp64; - asm volatile( - "1: \n" - // clang-format off - LOAD1_DATA32_LANE(v0, 0) - LOAD1_DATA32_LANE(v0, 1) - LOAD1_DATA32_LANE(v0, 2) - LOAD1_DATA32_LANE(v0, 3) - LOAD1_DATA32_LANE(v1, 0) - LOAD1_DATA32_LANE(v1, 1) - LOAD1_DATA32_LANE(v1, 2) - LOAD1_DATA32_LANE(v1, 3) - "prfm pldl1keep, [%1, 448] \n" // prefetch 7 lines ahead - // clang-format on - "st1 {v0.4s, v1.4s}, [%0], #32 \n" // store pixels - "subs %w2, %w2, #8 \n" // 8 processed per loop - "b.gt 1b \n" - : "+r"(dst_argb), // %0 - "+r"(src_argb), // %1 - "+r"(dst_width), // %2 - "+r"(x64), // %3 - "+r"(dx64), // %4 - "=&r"(tmp64), // %5 - "+r"(src_tmp) // %6 - : - : "memory", "cc", "v0", "v1"); -} - -#undef LOAD1_DATA32_LANE - -// TODO(Yang Zhang): Investigate less load instructions for -// the x/dx stepping -#define LOAD2_DATA32_LANE(vn1, vn2, n) \ - "lsr %5, %3, #16 \n" \ - "add %6, %1, %5, lsl #2 \n" \ - "add %3, %3, %4 \n" \ - "ld2 {" #vn1 ".s, " #vn2 ".s}[" #n "], [%6] \n" - -void ScaleARGBFilterCols_NEON(uint8_t* dst_argb, - const uint8_t* src_argb, - int dst_width, - int x, - int dx) { - int dx_offset[4] = {0, 1, 2, 3}; - int* tmp = dx_offset; - const uint8_t* src_tmp = src_argb; - int64_t x64 = (int64_t)x; // NOLINT - int64_t dx64 = (int64_t)dx; // NOLINT - asm volatile ( - "dup v0.4s, %w3 \n" // x - "dup v1.4s, %w4 \n" // dx - "ld1 {v2.4s}, [%5] \n" // 0 1 2 3 - "shl v6.4s, v1.4s, #2 \n" // 4 * dx - "mul v1.4s, v1.4s, v2.4s \n" - "movi v3.16b, #0x7f \n" // 0x7F - "movi v4.8h, #0x7f \n" // 0x7F - // x , x + 1 * dx, x + 2 * dx, x + 3 * dx - "add v5.4s, v1.4s, v0.4s \n" - "1: \n" - // d0, d1: a - // d2, d3: b - LOAD2_DATA32_LANE(v0, v1, 0) - LOAD2_DATA32_LANE(v0, v1, 1) - LOAD2_DATA32_LANE(v0, v1, 2) - LOAD2_DATA32_LANE(v0, v1, 3) - "shrn v2.4h, v5.4s, #9 \n" - "and v2.8b, v2.8b, v4.8b \n" - "dup v16.8b, v2.b[0] \n" - "dup v17.8b, v2.b[2] \n" - "dup v18.8b, v2.b[4] \n" - "dup v19.8b, v2.b[6] \n" - "ext v2.8b, v16.8b, v17.8b, #4 \n" - "ext v17.8b, v18.8b, v19.8b, #4 \n" - "ins v2.d[1], v17.d[0] \n" // f - "eor v7.16b, v2.16b, v3.16b \n" // 0x7f ^ f - "umull v16.8h, v0.8b, v7.8b \n" - "umull2 v17.8h, v0.16b, v7.16b \n" - "umull v18.8h, v1.8b, v2.8b \n" - "umull2 v19.8h, v1.16b, v2.16b \n" - "prfm pldl1keep, [%1, 448] \n" // prefetch 7 lines ahead - "add v16.8h, v16.8h, v18.8h \n" - "add v17.8h, v17.8h, v19.8h \n" - "shrn v0.8b, v16.8h, #7 \n" - "shrn2 v0.16b, v17.8h, #7 \n" - "st1 {v0.4s}, [%0], #16 \n" // store pixels - "add v5.4s, v5.4s, v6.4s \n" - "subs %w2, %w2, #4 \n" // 4 processed per loop - "b.gt 1b \n" - : "+r"(dst_argb), // %0 - "+r"(src_argb), // %1 - "+r"(dst_width), // %2 - "+r"(x64), // %3 - "+r"(dx64), // %4 - "+r"(tmp), // %5 - "+r"(src_tmp) // %6 - : - : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", - "v6", "v7", "v16", "v17", "v18", "v19" - ); -} - -#undef LOAD2_DATA32_LANE - 
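The filtered column scalers above all step a 16.16 fixed-point x by dx per output pixel and blend the two neighbouring source pixels with the BLENDER formula quoted before ScaleFilterCols_NEON; a minimal scalar sketch of that loop (one byte channel, illustrative name, not the vectorized lane loads):

#include <stdint.h>

// Sketch: 16.16 fixed-point horizontal filtering for a single byte channel.
static void FilterColsSketch(uint8_t* dst, const uint8_t* src,
                             int dst_width, int x, int dx) {
  int j;
  for (j = 0; j < dst_width; ++j) {
    int xi = x >> 16;     // integer source position
    int xf = x & 0xffff;  // 16-bit fraction
    int a = src[xi];
    int b = src[xi + 1];
    // BLENDER from row_common.cc: a + ((f * (b - a) + 0x8000) >> 16)
    dst[j] = (uint8_t)(a + ((xf * (b - a) + 0x8000) >> 16));
    x += dx;
  }
}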
-// Read 16x2 average down and write 8x1. -void ScaleRowDown2Box_16_NEON(const uint16_t* src_ptr, - ptrdiff_t src_stride, - uint16_t* dst, - int dst_width) { - asm volatile( - // change the stride to row 2 pointer - "add %1, %0, %1, lsl #1 \n" // ptr + stide * 2 - "1: \n" - "ld1 {v0.8h, v1.8h}, [%0], #32 \n" // load row 1 and post inc - "ld1 {v2.8h, v3.8h}, [%1], #32 \n" // load row 2 and post inc - "subs %w3, %w3, #8 \n" // 8 processed per loop - "uaddlp v0.4s, v0.8h \n" // row 1 add adjacent - "uaddlp v1.4s, v1.8h \n" - "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead - "uadalp v0.4s, v2.8h \n" // +row 2 add adjacent - "uadalp v1.4s, v3.8h \n" - "prfm pldl1keep, [%1, 448] \n" - "rshrn v0.4h, v0.4s, #2 \n" // round and pack - "rshrn2 v0.8h, v1.4s, #2 \n" - "st1 {v0.8h}, [%2], #16 \n" - "b.gt 1b \n" - : "+r"(src_ptr), // %0 - "+r"(src_stride), // %1 - "+r"(dst), // %2 - "+r"(dst_width) // %3 - : - : "v0", "v1", "v2", "v3" // Clobber List - ); -} - -// Read 8x2 upsample with filtering and write 16x1. -// Actually reads an extra pixel, so 9x2. -void ScaleRowUp2_16_NEON(const uint16_t* src_ptr, - ptrdiff_t src_stride, - uint16_t* dst, - int dst_width) { - asm volatile( - "add %1, %0, %1, lsl #1 \n" // ptr + stide * 2 - "movi v0.8h, #9 \n" // constants - "movi v1.4s, #3 \n" - - "1: \n" - "ld1 {v3.8h}, [%0], %4 \n" // TL read first 8 - "ld1 {v4.8h}, [%0], %5 \n" // TR read 8 offset by 1 - "ld1 {v5.8h}, [%1], %4 \n" // BL read 8 from next row - "ld1 {v6.8h}, [%1], %5 \n" // BR offset by 1 - "subs %w3, %w3, #16 \n" // 16 dst pixels per loop - "umull v16.4s, v3.4h, v0.4h \n" - "umull2 v7.4s, v3.8h, v0.8h \n" - "umull v18.4s, v4.4h, v0.4h \n" - "umull2 v17.4s, v4.8h, v0.8h \n" - "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead - "uaddw v16.4s, v16.4s, v6.4h \n" - "uaddl2 v19.4s, v6.8h, v3.8h \n" - "uaddl v3.4s, v6.4h, v3.4h \n" - "uaddw2 v6.4s, v7.4s, v6.8h \n" - "uaddl2 v7.4s, v5.8h, v4.8h \n" - "uaddl v4.4s, v5.4h, v4.4h \n" - "uaddw v18.4s, v18.4s, v5.4h \n" - "prfm pldl1keep, [%1, 448] \n" - "mla v16.4s, v4.4s, v1.4s \n" - "mla v18.4s, v3.4s, v1.4s \n" - "mla v6.4s, v7.4s, v1.4s \n" - "uaddw2 v4.4s, v17.4s, v5.8h \n" - "uqrshrn v16.4h, v16.4s, #4 \n" - "mla v4.4s, v19.4s, v1.4s \n" - "uqrshrn2 v16.8h, v6.4s, #4 \n" - "uqrshrn v17.4h, v18.4s, #4 \n" - "uqrshrn2 v17.8h, v4.4s, #4 \n" - "st2 {v16.8h-v17.8h}, [%2], #32 \n" - "b.gt 1b \n" - : "+r"(src_ptr), // %0 - "+r"(src_stride), // %1 - "+r"(dst), // %2 - "+r"(dst_width) // %3 - : "r"(2LL), // %4 - "r"(14LL) // %5 - : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17", "v18", - "v19" // Clobber List - ); -} - -void ScaleUVRowDown2Box_NEON(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst, - int dst_width) { - asm volatile( - // change the stride to row 2 pointer - "add %1, %1, %0 \n" - "1: \n" - "ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 UV - "subs %w3, %w3, #8 \n" // 8 processed per loop. - "uaddlp v0.8h, v0.16b \n" // U 16 bytes -> 8 shorts. - "uaddlp v1.8h, v1.16b \n" // V 16 bytes -> 8 shorts. - "ld2 {v16.16b,v17.16b}, [%1], #32 \n" // load 16 - "uadalp v0.8h, v16.16b \n" // U 16 bytes -> 8 shorts. - "uadalp v1.8h, v17.16b \n" // V 16 bytes -> 8 shorts. 
- "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead - "rshrn v0.8b, v0.8h, #2 \n" // round and pack - "prfm pldl1keep, [%1, 448] \n" - "rshrn v1.8b, v1.8h, #2 \n" - "st2 {v0.8b,v1.8b}, [%2], #16 \n" - "b.gt 1b \n" - : "+r"(src_ptr), // %0 - "+r"(src_stride), // %1 - "+r"(dst), // %2 - "+r"(dst_width) // %3 - : - : "memory", "cc", "v0", "v1", "v16", "v17"); -} - -// Reads 4 pixels at a time. -void ScaleUVRowDownEven_NEON(const uint8_t* src_ptr, - ptrdiff_t src_stride, - int src_stepx, // pixel step - uint8_t* dst_ptr, - int dst_width) { - const uint8_t* src1_ptr = src_ptr + src_stepx * 2; - const uint8_t* src2_ptr = src_ptr + src_stepx * 4; - const uint8_t* src3_ptr = src_ptr + src_stepx * 6; - (void)src_stride; - asm volatile( - "1: \n" - "ld1 {v0.h}[0], [%0], %6 \n" - "ld1 {v1.h}[0], [%1], %6 \n" - "ld1 {v2.h}[0], [%2], %6 \n" - "ld1 {v3.h}[0], [%3], %6 \n" - "subs %w5, %w5, #4 \n" // 4 pixels per loop. - "st4 {v0.h, v1.h, v2.h, v3.h}[0], [%4], #8 \n" - "b.gt 1b \n" - : "+r"(src_ptr), // %0 - "+r"(src1_ptr), // %1 - "+r"(src2_ptr), // %2 - "+r"(src3_ptr), // %3 - "+r"(dst_ptr), // %4 - "+r"(dst_width) // %5 - : "r"((int64_t)(src_stepx * 8)) // %6 - : "memory", "cc", "v0", "v1", "v2", "v3"); -} - -#endif // !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__) - -#ifdef __cplusplus -} // extern "C" -} // namespace libyuv -#endif diff --git a/thirdparty/libyuv/source/scale_uv.cc b/thirdparty/libyuv/source/scale_uv.cc deleted file mode 100644 index d9a3144..0000000 --- a/thirdparty/libyuv/source/scale_uv.cc +++ /dev/null @@ -1,1197 +0,0 @@ -/* - * Copyright 2020 The LibYuv Project Authors. All rights reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#include "libyuv/scale.h" - -#include -#include - -#include "libyuv/cpu_id.h" -#include "libyuv/planar_functions.h" // For CopyUV -#include "libyuv/row.h" -#include "libyuv/scale_row.h" - -#ifdef __cplusplus -namespace libyuv { -extern "C" { -#endif - -// Macros to enable specialized scalers - -#ifndef HAS_SCALEUVDOWN2 -#define HAS_SCALEUVDOWN2 1 -#endif -#ifndef HAS_SCALEUVDOWN4BOX -#define HAS_SCALEUVDOWN4BOX 1 -#endif -#ifndef HAS_SCALEUVDOWNEVEN -#define HAS_SCALEUVDOWNEVEN 1 -#endif -#ifndef HAS_SCALEUVBILINEARDOWN -#define HAS_SCALEUVBILINEARDOWN 1 -#endif -#ifndef HAS_SCALEUVBILINEARUP -#define HAS_SCALEUVBILINEARUP 1 -#endif -#ifndef HAS_UVCOPY -#define HAS_UVCOPY 1 -#endif -#ifndef HAS_SCALEPLANEVERTICAL -#define HAS_SCALEPLANEVERTICAL 1 -#endif - -static __inline int Abs(int v) { - return v >= 0 ? v : -v; -} - -// ScaleUV, 1/2 -// This is an optimized version for scaling down a UV to 1/2 of -// its original size. -#if HAS_SCALEUVDOWN2 -static void ScaleUVDown2(int src_width, - int src_height, - int dst_width, - int dst_height, - int src_stride, - int dst_stride, - const uint8_t* src_uv, - uint8_t* dst_uv, - int x, - int dx, - int y, - int dy, - enum FilterMode filtering) { - int j; - int row_stride = src_stride * (dy >> 16); - void (*ScaleUVRowDown2)(const uint8_t* src_uv, ptrdiff_t src_stride, - uint8_t* dst_uv, int dst_width) = - filtering == kFilterNone - ? ScaleUVRowDown2_C - : (filtering == kFilterLinear ? 
ScaleUVRowDown2Linear_C - : ScaleUVRowDown2Box_C); - (void)src_width; - (void)src_height; - (void)dx; - assert(dx == 65536 * 2); // Test scale factor of 2. - assert((dy & 0x1ffff) == 0); // Test vertical scale is multiple of 2. - // Advance to odd row, even column. - if (filtering == kFilterBilinear) { - src_uv += (y >> 16) * src_stride + (x >> 16) * 2; - } else { - src_uv += (y >> 16) * src_stride + ((x >> 16) - 1) * 2; - } - -#if defined(HAS_SCALEUVROWDOWN2BOX_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3) && filtering) { - ScaleUVRowDown2 = ScaleUVRowDown2Box_Any_SSSE3; - if (IS_ALIGNED(dst_width, 4)) { - ScaleUVRowDown2 = ScaleUVRowDown2Box_SSSE3; - } - } -#endif -#if defined(HAS_SCALEUVROWDOWN2BOX_AVX2) - if (TestCpuFlag(kCpuHasAVX2) && filtering) { - ScaleUVRowDown2 = ScaleUVRowDown2Box_Any_AVX2; - if (IS_ALIGNED(dst_width, 8)) { - ScaleUVRowDown2 = ScaleUVRowDown2Box_AVX2; - } - } -#endif -#if defined(HAS_SCALEUVROWDOWN2BOX_NEON) - if (TestCpuFlag(kCpuHasNEON) && filtering) { - ScaleUVRowDown2 = ScaleUVRowDown2Box_Any_NEON; - if (IS_ALIGNED(dst_width, 8)) { - ScaleUVRowDown2 = ScaleUVRowDown2Box_NEON; - } - } -#endif - -// This code is not enabled. Only box filter is available at this time. -#if defined(HAS_SCALEUVROWDOWN2_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - ScaleUVRowDown2 = - filtering == kFilterNone - ? ScaleUVRowDown2_Any_SSSE3 - : (filtering == kFilterLinear ? ScaleUVRowDown2Linear_Any_SSSE3 - : ScaleUVRowDown2Box_Any_SSSE3); - if (IS_ALIGNED(dst_width, 2)) { - ScaleUVRowDown2 = - filtering == kFilterNone - ? ScaleUVRowDown2_SSSE3 - : (filtering == kFilterLinear ? ScaleUVRowDown2Linear_SSSE3 - : ScaleUVRowDown2Box_SSSE3); - } - } -#endif -// This code is not enabled. Only box filter is available at this time. -#if defined(HAS_SCALEUVROWDOWN2_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - ScaleUVRowDown2 = - filtering == kFilterNone - ? ScaleUVRowDown2_Any_NEON - : (filtering == kFilterLinear ? ScaleUVRowDown2Linear_Any_NEON - : ScaleUVRowDown2Box_Any_NEON); - if (IS_ALIGNED(dst_width, 8)) { - ScaleUVRowDown2 = - filtering == kFilterNone - ? ScaleUVRowDown2_NEON - : (filtering == kFilterLinear ? ScaleUVRowDown2Linear_NEON - : ScaleUVRowDown2Box_NEON); - } - } -#endif -#if defined(HAS_SCALEUVROWDOWN2_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - ScaleUVRowDown2 = - filtering == kFilterNone - ? ScaleUVRowDown2_Any_MMI - : (filtering == kFilterLinear ? ScaleUVRowDown2Linear_Any_MMI - : ScaleUVRowDown2Box_Any_MMI); - if (IS_ALIGNED(dst_width, 2)) { - ScaleUVRowDown2 = - filtering == kFilterNone - ? ScaleUVRowDown2_MMI - : (filtering == kFilterLinear ? ScaleUVRowDown2Linear_MMI - : ScaleUVRowDown2Box_MMI); - } - } -#endif -#if defined(HAS_SCALEUVROWDOWN2_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - ScaleUVRowDown2 = - filtering == kFilterNone - ? ScaleUVRowDown2_Any_MSA - : (filtering == kFilterLinear ? ScaleUVRowDown2Linear_Any_MSA - : ScaleUVRowDown2Box_Any_MSA); - if (IS_ALIGNED(dst_width, 2)) { - ScaleUVRowDown2 = - filtering == kFilterNone - ? ScaleUVRowDown2_MSA - : (filtering == kFilterLinear ? ScaleUVRowDown2Linear_MSA - : ScaleUVRowDown2Box_MSA); - } - } -#endif - - if (filtering == kFilterLinear) { - src_stride = 0; - } - for (j = 0; j < dst_height; ++j) { - ScaleUVRowDown2(src_uv, src_stride, dst_uv, dst_width); - src_uv += row_stride; - dst_uv += dst_stride; - } -} -#endif // HAS_SCALEUVDOWN2 - -// ScaleUV, 1/4 -// This is an optimized version for scaling down a UV to 1/4 of -// its original size. 
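As the implementation below shows, the 1/4 box path is built by running the 2x2 box row routine twice through a temporary row buffer. A scalar sketch of the underlying 2x2 UV box average it relies on (assumed to mirror the C fallback's rounded average; names are illustrative):

#include <stddef.h>
#include <stdint.h>

// Sketch: average each 2x2 block of interleaved UV pixels into one UV pair.
static void UVRowDown2BoxSketch(const uint8_t* src_uv, ptrdiff_t src_stride,
                                uint8_t* dst_uv, int dst_width) {
  int x;
  for (x = 0; x < dst_width; ++x) {
    const uint8_t* s = src_uv + x * 4;  // two UV pairs from the top row
    const uint8_t* t = s + src_stride;  // the two pairs below them
    dst_uv[x * 2 + 0] = (uint8_t)((s[0] + s[2] + t[0] + t[2] + 2) >> 2);  // U
    dst_uv[x * 2 + 1] = (uint8_t)((s[1] + s[3] + t[1] + t[3] + 2) >> 2);  // V
  }
}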
-#if HAS_SCALEUVDOWN4BOX -static void ScaleUVDown4Box(int src_width, - int src_height, - int dst_width, - int dst_height, - int src_stride, - int dst_stride, - const uint8_t* src_uv, - uint8_t* dst_uv, - int x, - int dx, - int y, - int dy) { - int j; - // Allocate 2 rows of UV. - const int kRowSize = (dst_width * 2 * 2 + 15) & ~15; - align_buffer_64(row, kRowSize * 2); - int row_stride = src_stride * (dy >> 16); - void (*ScaleUVRowDown2)(const uint8_t* src_uv, ptrdiff_t src_stride, - uint8_t* dst_uv, int dst_width) = - ScaleUVRowDown2Box_C; - // Advance to odd row, even column. - src_uv += (y >> 16) * src_stride + (x >> 16) * 2; - (void)src_width; - (void)src_height; - (void)dx; - assert(dx == 65536 * 4); // Test scale factor of 4. - assert((dy & 0x3ffff) == 0); // Test vertical scale is multiple of 4. - -#if defined(HAS_SCALEUVROWDOWN2BOX_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - ScaleUVRowDown2 = ScaleUVRowDown2Box_Any_SSSE3; - if (IS_ALIGNED(dst_width, 4)) { - ScaleUVRowDown2 = ScaleUVRowDown2Box_SSSE3; - } - } -#endif -#if defined(HAS_SCALEUVROWDOWN2BOX_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - ScaleUVRowDown2 = ScaleUVRowDown2Box_Any_AVX2; - if (IS_ALIGNED(dst_width, 8)) { - ScaleUVRowDown2 = ScaleUVRowDown2Box_AVX2; - } - } -#endif -#if defined(HAS_SCALEUVROWDOWN2BOX_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - ScaleUVRowDown2 = ScaleUVRowDown2Box_Any_NEON; - if (IS_ALIGNED(dst_width, 8)) { - ScaleUVRowDown2 = ScaleUVRowDown2Box_NEON; - } - } -#endif - - for (j = 0; j < dst_height; ++j) { - ScaleUVRowDown2(src_uv, src_stride, row, dst_width * 2); - ScaleUVRowDown2(src_uv + src_stride * 2, src_stride, row + kRowSize, - dst_width * 2); - ScaleUVRowDown2(row, kRowSize, dst_uv, dst_width); - src_uv += row_stride; - dst_uv += dst_stride; - } - free_aligned_buffer_64(row); -} -#endif // HAS_SCALEUVDOWN4BOX - -// ScaleUV Even -// This is an optimized version for scaling down a UV to even -// multiple of its original size. -#if HAS_SCALEUVDOWNEVEN -static void ScaleUVDownEven(int src_width, - int src_height, - int dst_width, - int dst_height, - int src_stride, - int dst_stride, - const uint8_t* src_uv, - uint8_t* dst_uv, - int x, - int dx, - int y, - int dy, - enum FilterMode filtering) { - int j; - int col_step = dx >> 16; - int row_stride = (dy >> 16) * src_stride; - void (*ScaleUVRowDownEven)(const uint8_t* src_uv, ptrdiff_t src_stride, - int src_step, uint8_t* dst_uv, int dst_width) = - filtering ? ScaleUVRowDownEvenBox_C : ScaleUVRowDownEven_C; - (void)src_width; - (void)src_height; - assert(IS_ALIGNED(src_width, 2)); - assert(IS_ALIGNED(src_height, 2)); - src_uv += (y >> 16) * src_stride + (x >> 16) * 2; -#if defined(HAS_SCALEUVROWDOWNEVEN_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - ScaleUVRowDownEven = filtering ? ScaleUVRowDownEvenBox_Any_SSSE3 - : ScaleUVRowDownEven_Any_SSSE3; - if (IS_ALIGNED(dst_width, 4)) { - ScaleUVRowDownEven = - filtering ? ScaleUVRowDownEvenBox_SSE2 : ScaleUVRowDownEven_SSSE3; - } - } -#endif -#if defined(HAS_SCALEUVROWDOWNEVEN_NEON) - if (TestCpuFlag(kCpuHasNEON) && !filtering) { - ScaleUVRowDownEven = ScaleUVRowDownEven_Any_NEON; - if (IS_ALIGNED(dst_width, 4)) { - ScaleUVRowDownEven = ScaleUVRowDownEven_NEON; - } - } -#endif // TODO(fbarchard): Enable Box filter -#if defined(HAS_SCALEUVROWDOWNEVENBOX_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - ScaleUVRowDownEven = filtering ? ScaleUVRowDownEvenBox_Any_NEON - : ScaleUVRowDownEven_Any_NEON; - if (IS_ALIGNED(dst_width, 4)) { - ScaleUVRowDownEven = - filtering ? 
ScaleUVRowDownEvenBox_NEON : ScaleUVRowDownEven_NEON; - } - } -#endif -#if defined(HAS_SCALEUVROWDOWNEVEN_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - ScaleUVRowDownEven = - filtering ? ScaleUVRowDownEvenBox_Any_MMI : ScaleUVRowDownEven_Any_MMI; - if (IS_ALIGNED(dst_width, 2)) { - ScaleUVRowDownEven = - filtering ? ScaleUVRowDownEvenBox_MMI : ScaleUVRowDownEven_MMI; - } - } -#endif -#if defined(HAS_SCALEUVROWDOWNEVEN_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - ScaleUVRowDownEven = - filtering ? ScaleUVRowDownEvenBox_Any_MSA : ScaleUVRowDownEven_Any_MSA; - if (IS_ALIGNED(dst_width, 4)) { - ScaleUVRowDownEven = - filtering ? ScaleUVRowDownEvenBox_MSA : ScaleUVRowDownEven_MSA; - } - } -#endif - - if (filtering == kFilterLinear) { - src_stride = 0; - } - for (j = 0; j < dst_height; ++j) { - ScaleUVRowDownEven(src_uv, src_stride, col_step, dst_uv, dst_width); - src_uv += row_stride; - dst_uv += dst_stride; - } -} -#endif - -// Scale UV down with bilinear interpolation. -#if HAS_SCALEUVBILINEARDOWN -static void ScaleUVBilinearDown(int src_width, - int src_height, - int dst_width, - int dst_height, - int src_stride, - int dst_stride, - const uint8_t* src_uv, - uint8_t* dst_uv, - int x, - int dx, - int y, - int dy, - enum FilterMode filtering) { - int j; - void (*InterpolateRow)(uint8_t * dst_uv, const uint8_t* src_uv, - ptrdiff_t src_stride, int dst_width, - int source_y_fraction) = InterpolateRow_C; - void (*ScaleUVFilterCols)(uint8_t * dst_uv, const uint8_t* src_uv, - int dst_width, int x, int dx) = - (src_width >= 32768) ? ScaleUVFilterCols64_C : ScaleUVFilterCols_C; - int64_t xlast = x + (int64_t)(dst_width - 1) * dx; - int64_t xl = (dx >= 0) ? x : xlast; - int64_t xr = (dx >= 0) ? xlast : x; - int clip_src_width; - xl = (xl >> 16) & ~3; // Left edge aligned. - xr = (xr >> 16) + 1; // Right most pixel used. Bilinear uses 2 pixels. - xr = (xr + 1 + 3) & ~3; // 1 beyond 4 pixel aligned right most pixel. - if (xr > src_width) { - xr = src_width; - } - clip_src_width = (int)(xr - xl) * 2; // Width aligned to 2. 
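A quick worked example of the clip computation above, with hypothetical values: for dx > 0, x = 10.5 and xlast = 22.5 (both in 16.16), xl = 10 & ~3 = 8 and xr = 22 + 1 = 23, which (23 + 1 + 3) & ~3 rounds up to 24 (then clamps to src_width if needed), giving clip_src_width = (24 - 8) * 2 = 32.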
- src_uv += xl * 2; - x -= (int)(xl << 16); -#if defined(HAS_INTERPOLATEROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - InterpolateRow = InterpolateRow_Any_SSSE3; - if (IS_ALIGNED(clip_src_width, 16)) { - InterpolateRow = InterpolateRow_SSSE3; - } - } -#endif -#if defined(HAS_INTERPOLATEROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - InterpolateRow = InterpolateRow_Any_AVX2; - if (IS_ALIGNED(clip_src_width, 32)) { - InterpolateRow = InterpolateRow_AVX2; - } - } -#endif -#if defined(HAS_INTERPOLATEROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - InterpolateRow = InterpolateRow_Any_NEON; - if (IS_ALIGNED(clip_src_width, 16)) { - InterpolateRow = InterpolateRow_NEON; - } - } -#endif -#if defined(HAS_INTERPOLATEROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - InterpolateRow = InterpolateRow_Any_MSA; - if (IS_ALIGNED(clip_src_width, 32)) { - InterpolateRow = InterpolateRow_MSA; - } - } -#endif -#if defined(HAS_SCALEUVFILTERCOLS_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) { - ScaleUVFilterCols = ScaleUVFilterCols_SSSE3; - } -#endif -#if defined(HAS_SCALEUVFILTERCOLS_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - ScaleUVFilterCols = ScaleUVFilterCols_Any_NEON; - if (IS_ALIGNED(dst_width, 4)) { - ScaleUVFilterCols = ScaleUVFilterCols_NEON; - } - } -#endif -#if defined(HAS_SCALEUVFILTERCOLS_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - ScaleUVFilterCols = ScaleUVFilterCols_Any_MSA; - if (IS_ALIGNED(dst_width, 8)) { - ScaleUVFilterCols = ScaleUVFilterCols_MSA; - } - } -#endif - // TODO(fbarchard): Consider not allocating row buffer for kFilterLinear. - // Allocate a row of UV. - { - align_buffer_64(row, clip_src_width * 2); - - const int max_y = (src_height - 1) << 16; - if (y > max_y) { - y = max_y; - } - for (j = 0; j < dst_height; ++j) { - int yi = y >> 16; - const uint8_t* src = src_uv + yi * src_stride; - if (filtering == kFilterLinear) { - ScaleUVFilterCols(dst_uv, src, dst_width, x, dx); - } else { - int yf = (y >> 8) & 255; - InterpolateRow(row, src, src_stride, clip_src_width, yf); - ScaleUVFilterCols(dst_uv, row, dst_width, x, dx); - } - dst_uv += dst_stride; - y += dy; - if (y > max_y) { - y = max_y; - } - } - free_aligned_buffer_64(row); - } -} -#endif - -// Scale UV up with bilinear interpolation. -#if HAS_SCALEUVBILINEARUP -static void ScaleUVBilinearUp(int src_width, - int src_height, - int dst_width, - int dst_height, - int src_stride, - int dst_stride, - const uint8_t* src_uv, - uint8_t* dst_uv, - int x, - int dx, - int y, - int dy, - enum FilterMode filtering) { - int j; - void (*InterpolateRow)(uint8_t * dst_uv, const uint8_t* src_uv, - ptrdiff_t src_stride, int dst_width, - int source_y_fraction) = InterpolateRow_C; - void (*ScaleUVFilterCols)(uint8_t * dst_uv, const uint8_t* src_uv, - int dst_width, int x, int dx) = - filtering ? 
ScaleUVFilterCols_C : ScaleUVCols_C; - const int max_y = (src_height - 1) << 16; -#if defined(HAS_INTERPOLATEROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - InterpolateRow = InterpolateRow_Any_SSSE3; - if (IS_ALIGNED(dst_width, 8)) { - InterpolateRow = InterpolateRow_SSSE3; - } - } -#endif -#if defined(HAS_INTERPOLATEROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - InterpolateRow = InterpolateRow_Any_AVX2; - if (IS_ALIGNED(dst_width, 16)) { - InterpolateRow = InterpolateRow_AVX2; - } - } -#endif -#if defined(HAS_INTERPOLATEROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - InterpolateRow = InterpolateRow_Any_NEON; - if (IS_ALIGNED(dst_width, 8)) { - InterpolateRow = InterpolateRow_NEON; - } - } -#endif -#if defined(HAS_INTERPOLATEROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - InterpolateRow = InterpolateRow_Any_MMI; - if (IS_ALIGNED(dst_width, 4)) { - InterpolateRow = InterpolateRow_MMI; - } - } -#endif -#if defined(HAS_INTERPOLATEROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - InterpolateRow = InterpolateRow_Any_MSA; - if (IS_ALIGNED(dst_width, 16)) { - InterpolateRow = InterpolateRow_MSA; - } - } -#endif - if (src_width >= 32768) { - ScaleUVFilterCols = filtering ? ScaleUVFilterCols64_C : ScaleUVCols64_C; - } -#if defined(HAS_SCALEUVFILTERCOLS_SSSE3) - if (filtering && TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) { - ScaleUVFilterCols = ScaleUVFilterCols_SSSE3; - } -#endif -#if defined(HAS_SCALEUVFILTERCOLS_NEON) - if (filtering && TestCpuFlag(kCpuHasNEON)) { - ScaleUVFilterCols = ScaleUVFilterCols_Any_NEON; - if (IS_ALIGNED(dst_width, 8)) { - ScaleUVFilterCols = ScaleUVFilterCols_NEON; - } - } -#endif -#if defined(HAS_SCALEUVFILTERCOLS_MSA) - if (filtering && TestCpuFlag(kCpuHasMSA)) { - ScaleUVFilterCols = ScaleUVFilterCols_Any_MSA; - if (IS_ALIGNED(dst_width, 16)) { - ScaleUVFilterCols = ScaleUVFilterCols_MSA; - } - } -#endif -#if defined(HAS_SCALEUVCOLS_SSSE3) - if (!filtering && TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) { - ScaleUVFilterCols = ScaleUVCols_SSSE3; - } -#endif -#if defined(HAS_SCALEUVCOLS_NEON) - if (!filtering && TestCpuFlag(kCpuHasNEON)) { - ScaleUVFilterCols = ScaleUVCols_Any_NEON; - if (IS_ALIGNED(dst_width, 16)) { - ScaleUVFilterCols = ScaleUVCols_NEON; - } - } -#endif -#if defined(HAS_SCALEUVCOLS_MMI) - if (!filtering && TestCpuFlag(kCpuHasMMI)) { - ScaleUVFilterCols = ScaleUVCols_Any_MMI; - if (IS_ALIGNED(dst_width, 1)) { - ScaleUVFilterCols = ScaleUVCols_MMI; - } - } -#endif -#if defined(HAS_SCALEUVCOLS_MSA) - if (!filtering && TestCpuFlag(kCpuHasMSA)) { - ScaleUVFilterCols = ScaleUVCols_Any_MSA; - if (IS_ALIGNED(dst_width, 8)) { - ScaleUVFilterCols = ScaleUVCols_MSA; - } - } -#endif - if (!filtering && src_width * 2 == dst_width && x < 0x8000) { - ScaleUVFilterCols = ScaleUVColsUp2_C; -#if defined(HAS_SCALEUVCOLSUP2_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(dst_width, 8)) { - ScaleUVFilterCols = ScaleUVColsUp2_SSSE3; - } -#endif -#if defined(HAS_SCALEUVCOLSUP2_MMI) - if (TestCpuFlag(kCpuHasMMI) && IS_ALIGNED(dst_width, 4)) { - ScaleUVFilterCols = ScaleUVColsUp2_MMI; - } -#endif - } - - if (y > max_y) { - y = max_y; - } - - { - int yi = y >> 16; - const uint8_t* src = src_uv + yi * src_stride; - - // Allocate 2 rows of UV. 
- const int kRowSize = (dst_width * 2 + 15) & ~15; - align_buffer_64(row, kRowSize * 2); - - uint8_t* rowptr = row; - int rowstride = kRowSize; - int lasty = yi; - - ScaleUVFilterCols(rowptr, src, dst_width, x, dx); - if (src_height > 1) { - src += src_stride; - } - ScaleUVFilterCols(rowptr + rowstride, src, dst_width, x, dx); - src += src_stride; - - for (j = 0; j < dst_height; ++j) { - yi = y >> 16; - if (yi != lasty) { - if (y > max_y) { - y = max_y; - yi = y >> 16; - src = src_uv + yi * src_stride; - } - if (yi != lasty) { - ScaleUVFilterCols(rowptr, src, dst_width, x, dx); - rowptr += rowstride; - rowstride = -rowstride; - lasty = yi; - src += src_stride; - } - } - if (filtering == kFilterLinear) { - InterpolateRow(dst_uv, rowptr, 0, dst_width * 2, 0); - } else { - int yf = (y >> 8) & 255; - InterpolateRow(dst_uv, rowptr, rowstride, dst_width * 2, yf); - } - dst_uv += dst_stride; - y += dy; - } - free_aligned_buffer_64(row); - } -} -#endif // HAS_SCALEUVBILINEARUP - -// Scale UV, horizontally up by 2 times. -// Uses linear filter horizontally, nearest vertically. -// This is an optimized version for scaling up a plane to 2 times of -// its original width, using linear interpolation. -// This is used to scale U and V planes of NV16 to NV24. -void ScaleUVLinearUp2(int src_width, - int src_height, - int dst_width, - int dst_height, - int src_stride, - int dst_stride, - const uint8_t* src_uv, - uint8_t* dst_uv) { - void (*ScaleRowUp)(const uint8_t* src_uv, uint8_t* dst_uv, int dst_width) = - ScaleUVRowUp2_Linear_Any_C; - int i; - int y; - int dy; - - // This function can only scale up by 2 times horizontally. - assert(src_width == ((dst_width + 1) / 2)); - -#ifdef HAS_SCALEUVROWUP2LINEAR_SSSE3 - if (TestCpuFlag(kCpuHasSSSE3)) { - ScaleRowUp = ScaleUVRowUp2_Linear_Any_SSSE3; - } -#endif - -#ifdef HAS_SCALEUVROWUP2LINEAR_AVX2 - if (TestCpuFlag(kCpuHasAVX2)) { - ScaleRowUp = ScaleUVRowUp2_Linear_Any_AVX2; - } -#endif - -#ifdef HAS_SCALEUVROWUP2LINEAR_NEON - if (TestCpuFlag(kCpuHasNEON)) { - ScaleRowUp = ScaleUVRowUp2_Linear_Any_NEON; - } -#endif - - if (dst_height == 1) { - ScaleRowUp(src_uv + ((src_height - 1) / 2) * src_stride, dst_uv, dst_width); - } else { - dy = FixedDiv(src_height - 1, dst_height - 1); - y = (1 << 15) - 1; - for (i = 0; i < dst_height; ++i) { - ScaleRowUp(src_uv + (y >> 16) * src_stride, dst_uv, dst_width); - dst_uv += dst_stride; - y += dy; - } - } -} - -// Scale plane, up by 2 times. -// This is an optimized version for scaling up a plane to 2 times of -// its original size, using bilinear interpolation. -// This is used to scale U and V planes of NV12 to NV24. -void ScaleUVBilinearUp2(int src_width, - int src_height, - int dst_width, - int dst_height, - int src_stride, - int dst_stride, - const uint8_t* src_ptr, - uint8_t* dst_ptr) { - void (*Scale2RowUp)(const uint8_t* src_ptr, ptrdiff_t src_stride, - uint8_t* dst_ptr, ptrdiff_t dst_stride, int dst_width) = - ScaleUVRowUp2_Bilinear_Any_C; - int x; - - // This function can only scale up by 2 times. 
- assert(src_width == ((dst_width + 1) / 2)); - assert(src_height == ((dst_height + 1) / 2)); - -#ifdef HAS_SCALEUVROWUP2BILINEAR_SSSE3 - if (TestCpuFlag(kCpuHasSSSE3)) { - Scale2RowUp = ScaleUVRowUp2_Bilinear_Any_SSSE3; - } -#endif - -#ifdef HAS_SCALEUVROWUP2BILINEAR_AVX2 - if (TestCpuFlag(kCpuHasAVX2)) { - Scale2RowUp = ScaleUVRowUp2_Bilinear_Any_AVX2; - } -#endif - -#ifdef HAS_SCALEUVROWUP2BILINEAR_NEON - if (TestCpuFlag(kCpuHasNEON)) { - Scale2RowUp = ScaleUVRowUp2_Bilinear_Any_NEON; - } -#endif - - Scale2RowUp(src_ptr, 0, dst_ptr, 0, dst_width); - dst_ptr += dst_stride; - for (x = 0; x < src_height - 1; ++x) { - Scale2RowUp(src_ptr, src_stride, dst_ptr, dst_stride, dst_width); - src_ptr += src_stride; - // TODO(fbarchard): Test performance of writing one row of destination at a - // time. - dst_ptr += 2 * dst_stride; - } - if (!(dst_height & 1)) { - Scale2RowUp(src_ptr, 0, dst_ptr, 0, dst_width); - } -} - -// Scale 16 bit UV, horizontally up by 2 times. -// Uses linear filter horizontally, nearest vertically. -// This is an optimized version for scaling up a plane to 2 times of -// its original width, using linear interpolation. -// This is used to scale U and V planes of P210 to P410. -void ScaleUVLinearUp2_16(int src_width, - int src_height, - int dst_width, - int dst_height, - int src_stride, - int dst_stride, - const uint16_t* src_uv, - uint16_t* dst_uv) { - void (*ScaleRowUp)(const uint16_t* src_uv, uint16_t* dst_uv, int dst_width) = - ScaleUVRowUp2_Linear_16_Any_C; - int i; - int y; - int dy; - - // This function can only scale up by 2 times horizontally. - assert(src_width == ((dst_width + 1) / 2)); - -#ifdef HAS_SCALEUVROWUP2LINEAR_16_SSE2 - if (TestCpuFlag(kCpuHasSSE2)) { - ScaleRowUp = ScaleUVRowUp2_Linear_16_Any_SSE2; - } -#endif - -#ifdef HAS_SCALEUVROWUP2LINEAR_16_AVX2 - if (TestCpuFlag(kCpuHasAVX2)) { - ScaleRowUp = ScaleUVRowUp2_Linear_16_Any_AVX2; - } -#endif - -#ifdef HAS_SCALEUVROWUP2LINEAR_16_NEON - if (TestCpuFlag(kCpuHasNEON)) { - ScaleRowUp = ScaleUVRowUp2_Linear_16_Any_NEON; - } -#endif - - if (dst_height == 1) { - ScaleRowUp(src_uv + ((src_height - 1) / 2) * src_stride, dst_uv, dst_width); - } else { - dy = FixedDiv(src_height - 1, dst_height - 1); - y = (1 << 15) - 1; - for (i = 0; i < dst_height; ++i) { - ScaleRowUp(src_uv + (y >> 16) * src_stride, dst_uv, dst_width); - dst_uv += dst_stride; - y += dy; - } - } -} - -// Scale 16 bit UV, up by 2 times. -// This is an optimized version for scaling up a plane to 2 times of -// its original size, using bilinear interpolation. -// This is used to scale U and V planes of P010 to P410. -void ScaleUVBilinearUp2_16(int src_width, - int src_height, - int dst_width, - int dst_height, - int src_stride, - int dst_stride, - const uint16_t* src_ptr, - uint16_t* dst_ptr) { - void (*Scale2RowUp)(const uint16_t* src_ptr, ptrdiff_t src_stride, - uint16_t* dst_ptr, ptrdiff_t dst_stride, int dst_width) = - ScaleUVRowUp2_Bilinear_16_Any_C; - int x; - - // This function can only scale up by 2 times. 
- assert(src_width == ((dst_width + 1) / 2)); - assert(src_height == ((dst_height + 1) / 2)); - -#ifdef HAS_SCALEUVROWUP2BILINEAR_16_SSE2 - if (TestCpuFlag(kCpuHasSSE2)) { - Scale2RowUp = ScaleUVRowUp2_Bilinear_16_Any_SSE2; - } -#endif - -#ifdef HAS_SCALEUVROWUP2BILINEAR_16_AVX2 - if (TestCpuFlag(kCpuHasAVX2)) { - Scale2RowUp = ScaleUVRowUp2_Bilinear_16_Any_AVX2; - } -#endif - -#ifdef HAS_SCALEUVROWUP2BILINEAR_16_NEON - if (TestCpuFlag(kCpuHasNEON)) { - Scale2RowUp = ScaleUVRowUp2_Bilinear_16_Any_NEON; - } -#endif - - Scale2RowUp(src_ptr, 0, dst_ptr, 0, dst_width); - dst_ptr += dst_stride; - for (x = 0; x < src_height - 1; ++x) { - Scale2RowUp(src_ptr, src_stride, dst_ptr, dst_stride, dst_width); - src_ptr += src_stride; - // TODO(fbarchard): Test performance of writing one row of destination at a - // time. - dst_ptr += 2 * dst_stride; - } - if (!(dst_height & 1)) { - Scale2RowUp(src_ptr, 0, dst_ptr, 0, dst_width); - } -} - -// Scale UV to/from any dimensions, without interpolation. -// Fixed point math is used for performance: The upper 16 bits -// of x and dx is the integer part of the source position and -// the lower 16 bits are the fixed decimal part. - -static void ScaleUVSimple(int src_width, - int src_height, - int dst_width, - int dst_height, - int src_stride, - int dst_stride, - const uint8_t* src_uv, - uint8_t* dst_uv, - int x, - int dx, - int y, - int dy) { - int j; - void (*ScaleUVCols)(uint8_t * dst_uv, const uint8_t* src_uv, int dst_width, - int x, int dx) = - (src_width >= 32768) ? ScaleUVCols64_C : ScaleUVCols_C; - (void)src_height; -#if defined(HAS_SCALEUVCOLS_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) { - ScaleUVCols = ScaleUVCols_SSSE3; - } -#endif -#if defined(HAS_SCALEUVCOLS_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - ScaleUVCols = ScaleUVCols_Any_NEON; - if (IS_ALIGNED(dst_width, 8)) { - ScaleUVCols = ScaleUVCols_NEON; - } - } -#endif -#if defined(HAS_SCALEUVCOLS_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - ScaleUVCols = ScaleUVCols_Any_MMI; - if (IS_ALIGNED(dst_width, 1)) { - ScaleUVCols = ScaleUVCols_MMI; - } - } -#endif -#if defined(HAS_SCALEUVCOLS_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - ScaleUVCols = ScaleUVCols_Any_MSA; - if (IS_ALIGNED(dst_width, 4)) { - ScaleUVCols = ScaleUVCols_MSA; - } - } -#endif - if (src_width * 2 == dst_width && x < 0x8000) { - ScaleUVCols = ScaleUVColsUp2_C; -#if defined(HAS_SCALEUVCOLSUP2_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(dst_width, 8)) { - ScaleUVCols = ScaleUVColsUp2_SSSE3; - } -#endif -#if defined(HAS_SCALEUVCOLSUP2_MMI) - if (TestCpuFlag(kCpuHasMMI) && IS_ALIGNED(dst_width, 4)) { - ScaleUVCols = ScaleUVColsUp2_MMI; - } -#endif - } - - for (j = 0; j < dst_height; ++j) { - ScaleUVCols(dst_uv, src_uv + (y >> 16) * src_stride, dst_width, x, dx); - dst_uv += dst_stride; - y += dy; - } -} - -// Copy UV with optional flipping -#if HAS_UVCOPY -static int UVCopy(const uint8_t* src_UV, - int src_stride_uv, - uint8_t* dst_UV, - int dst_stride_uv, - int width, - int height) { - if (!src_UV || !dst_UV || width <= 0 || height == 0) { - return -1; - } - // Negative height means invert the image. 
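The 16.16 convention described above ScaleUVSimple (upper 16 bits of x and dx are the integer source position, lower 16 bits the fraction) reduces to a very small loop in the unfiltered case; a scalar sketch, illustrative only and roughly what the ScaleUVCols_C fallback does:

#include <stdint.h>

// Sketch: nearest-neighbour UV column selection with 16.16 stepping.
static void UVColsSketch(uint8_t* dst_uv, const uint8_t* src_uv,
                         int dst_width, int x, int dx) {
  int j;
  for (j = 0; j < dst_width; ++j) {
    int xi = x >> 16;                        // integer part picks the UV pair
    dst_uv[j * 2 + 0] = src_uv[xi * 2 + 0];
    dst_uv[j * 2 + 1] = src_uv[xi * 2 + 1];
    x += dx;                                 // fraction accumulates in low bits
  }
}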
- if (height < 0) { - height = -height; - src_UV = src_UV + (height - 1) * src_stride_uv; - src_stride_uv = -src_stride_uv; - } - - CopyPlane(src_UV, src_stride_uv, dst_UV, dst_stride_uv, width * 2, height); - return 0; -} - -static int UVCopy_16(const uint16_t* src_UV, - int src_stride_uv, - uint16_t* dst_UV, - int dst_stride_uv, - int width, - int height) { - if (!src_UV || !dst_UV || width <= 0 || height == 0) { - return -1; - } - // Negative height means invert the image. - if (height < 0) { - height = -height; - src_UV = src_UV + (height - 1) * src_stride_uv; - src_stride_uv = -src_stride_uv; - } - - CopyPlane_16(src_UV, src_stride_uv, dst_UV, dst_stride_uv, width * 2, height); - return 0; -} -#endif // HAS_UVCOPY - -// Scale a UV plane (from NV12) -// This function in turn calls a scaling function -// suitable for handling the desired resolutions. -static void ScaleUV(const uint8_t* src, - int src_stride, - int src_width, - int src_height, - uint8_t* dst, - int dst_stride, - int dst_width, - int dst_height, - int clip_x, - int clip_y, - int clip_width, - int clip_height, - enum FilterMode filtering) { - // Initial source x/y coordinate and step values as 16.16 fixed point. - int x = 0; - int y = 0; - int dx = 0; - int dy = 0; - // UV does not support box filter yet, but allow the user to pass it. - // Simplify filtering when possible. - filtering = ScaleFilterReduce(src_width, src_height, dst_width, dst_height, - filtering); - - // Negative src_height means invert the image. - if (src_height < 0) { - src_height = -src_height; - src = src + (src_height - 1) * src_stride; - src_stride = -src_stride; - } - ScaleSlope(src_width, src_height, dst_width, dst_height, filtering, &x, &y, - &dx, &dy); - src_width = Abs(src_width); - if (clip_x) { - int64_t clipf = (int64_t)(clip_x)*dx; - x += (clipf & 0xffff); - src += (clipf >> 16) * 2; - dst += clip_x * 2; - } - if (clip_y) { - int64_t clipf = (int64_t)(clip_y)*dy; - y += (clipf & 0xffff); - src += (clipf >> 16) * src_stride; - dst += clip_y * dst_stride; - } - - // Special case for integer step values. - if (((dx | dy) & 0xffff) == 0) { - if (!dx || !dy) { // 1 pixel wide and/or tall. - filtering = kFilterNone; - } else { - // Optimized even scale down. ie 2, 4, 6, 8, 10x. - if (!(dx & 0x10000) && !(dy & 0x10000)) { -#if HAS_SCALEUVDOWN2 - if (dx == 0x20000) { - // Optimized 1/2 downsample. - ScaleUVDown2(src_width, src_height, clip_width, clip_height, - src_stride, dst_stride, src, dst, x, dx, y, dy, - filtering); - return; - } -#endif -#if HAS_SCALEUVDOWN4BOX - if (dx == 0x40000 && filtering == kFilterBox) { - // Optimized 1/4 box downsample. - ScaleUVDown4Box(src_width, src_height, clip_width, clip_height, - src_stride, dst_stride, src, dst, x, dx, y, dy); - return; - } -#endif -#if HAS_SCALEUVDOWNEVEN - ScaleUVDownEven(src_width, src_height, clip_width, clip_height, - src_stride, dst_stride, src, dst, x, dx, y, dy, - filtering); - return; -#endif - } - // Optimized odd scale down. ie 3, 5, 7, 9x. - if ((dx & 0x10000) && (dy & 0x10000)) { - filtering = kFilterNone; -#ifdef HAS_UVCOPY - if (dx == 0x10000 && dy == 0x10000) { - // Straight copy. - UVCopy(src + (y >> 16) * src_stride + (x >> 16) * 2, src_stride, dst, - dst_stride, clip_width, clip_height); - return; - } -#endif - } - } - } - // HAS_SCALEPLANEVERTICAL - if (dx == 0x10000 && (x & 0xffff) == 0) { - // Arbitrary scale vertically, but unscaled horizontally. 
- ScalePlaneVertical(src_height, clip_width, clip_height, src_stride, - dst_stride, src, dst, x, y, dy, 4, filtering); - return; - } - if (filtering && (dst_width + 1) / 2 == src_width) { - ScaleUVLinearUp2(src_width, src_height, clip_width, clip_height, src_stride, - dst_stride, src, dst); - return; - } - if ((clip_height + 1) / 2 == src_height && - (clip_width + 1) / 2 == src_width && - (filtering == kFilterBilinear || filtering == kFilterBox)) { - ScaleUVBilinearUp2(src_width, src_height, clip_width, clip_height, - src_stride, dst_stride, src, dst); - return; - } -#if HAS_SCALEUVBILINEARUP - if (filtering && dy < 65536) { - ScaleUVBilinearUp(src_width, src_height, clip_width, clip_height, - src_stride, dst_stride, src, dst, x, dx, y, dy, - filtering); - return; - } -#endif -#if HAS_SCALEUVBILINEARDOWN - if (filtering) { - ScaleUVBilinearDown(src_width, src_height, clip_width, clip_height, - src_stride, dst_stride, src, dst, x, dx, y, dy, - filtering); - return; - } -#endif - ScaleUVSimple(src_width, src_height, clip_width, clip_height, src_stride, - dst_stride, src, dst, x, dx, y, dy); -} - -// Scale an UV image. -LIBYUV_API -int UVScale(const uint8_t* src_uv, - int src_stride_uv, - int src_width, - int src_height, - uint8_t* dst_uv, - int dst_stride_uv, - int dst_width, - int dst_height, - enum FilterMode filtering) { - if (!src_uv || src_width <= 0 || src_height == 0 || src_width > 32768 || - src_height > 32768 || !dst_uv || dst_width <= 0 || dst_height <= 0) { - return -1; - } - ScaleUV(src_uv, src_stride_uv, src_width, src_height, dst_uv, dst_stride_uv, - dst_width, dst_height, 0, 0, dst_width, dst_height, filtering); - return 0; -} - -// Scale a 16 bit UV image. -// This function is currently incomplete, it can't handle all cases. -LIBYUV_API -int UVScale_16(const uint16_t* src_uv, - int src_stride_uv, - int src_width, - int src_height, - uint16_t* dst_uv, - int dst_stride_uv, - int dst_width, - int dst_height, - enum FilterMode filtering) { - int dy = 0; - - if (!src_uv || src_width <= 0 || src_height == 0 || src_width > 32768 || - src_height > 32768 || !dst_uv || dst_width <= 0 || dst_height <= 0) { - return -1; - } - - // UV does not support box filter yet, but allow the user to pass it. - // Simplify filtering when possible. - filtering = ScaleFilterReduce(src_width, src_height, dst_width, dst_height, - filtering); - - // Negative src_height means invert the image. 
- if (src_height < 0) { - src_height = -src_height; - src_uv = src_uv + (src_height - 1) * src_stride_uv; - src_stride_uv = -src_stride_uv; - } - src_width = Abs(src_width); - -#ifdef HAS_UVCOPY - if (!filtering && src_width == dst_width && (src_height % dst_height == 0)) { - if (dst_height == 1) { - UVCopy_16(src_uv + ((src_height - 1) / 2) * src_stride_uv, src_stride_uv, - dst_uv, dst_stride_uv, dst_width, dst_height); - } else { - dy = src_height / dst_height; - UVCopy_16(src_uv + src_stride_uv * ((dy - 1) / 2), src_stride_uv * dy, - dst_uv, dst_stride_uv, dst_width, dst_height); - } - - return 0; - } -#endif - - if (filtering && (dst_width + 1) / 2 == src_width) { - ScaleUVLinearUp2_16(src_width, src_height, dst_width, dst_height, - src_stride_uv, dst_stride_uv, src_uv, dst_uv); - return 0; - } - - if ((dst_height + 1) / 2 == src_height && (dst_width + 1) / 2 == src_width && - (filtering == kFilterBilinear || filtering == kFilterBox)) { - ScaleUVBilinearUp2_16(src_width, src_height, dst_width, dst_height, - src_stride_uv, dst_stride_uv, src_uv, dst_uv); - return 0; - } - - return -1; -} - -#ifdef __cplusplus -} // extern "C" -} // namespace libyuv -#endif diff --git a/thirdparty/libyuv/source/scale_win.cc b/thirdparty/libyuv/source/scale_win.cc deleted file mode 100644 index ea1f95c..0000000 --- a/thirdparty/libyuv/source/scale_win.cc +++ /dev/null @@ -1,1392 +0,0 @@ -/* - * Copyright 2013 The LibYuv Project Authors. All rights reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#include "libyuv/row.h" -#include "libyuv/scale_row.h" - -#ifdef __cplusplus -namespace libyuv { -extern "C" { -#endif - -// This module is for 32 bit Visual C x86 -#if !defined(LIBYUV_DISABLE_X86) && defined(_MSC_VER) && \ - !defined(__clang__) && defined(_M_IX86) - -// Offsets for source bytes 0 to 9 -static const uvec8 kShuf0 = {0, 1, 3, 4, 5, 7, 8, 9, - 128, 128, 128, 128, 128, 128, 128, 128}; - -// Offsets for source bytes 11 to 20 with 8 subtracted = 3 to 12. -static const uvec8 kShuf1 = {3, 4, 5, 7, 8, 9, 11, 12, - 128, 128, 128, 128, 128, 128, 128, 128}; - -// Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 31. -static const uvec8 kShuf2 = {5, 7, 8, 9, 11, 12, 13, 15, - 128, 128, 128, 128, 128, 128, 128, 128}; - -// Offsets for source bytes 0 to 10 -static const uvec8 kShuf01 = {0, 1, 1, 2, 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10}; - -// Offsets for source bytes 10 to 21 with 8 subtracted = 3 to 13. -static const uvec8 kShuf11 = {2, 3, 4, 5, 5, 6, 6, 7, - 8, 9, 9, 10, 10, 11, 12, 13}; - -// Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 31. 
-static const uvec8 kShuf21 = {5, 6, 6, 7, 8, 9, 9, 10, - 10, 11, 12, 13, 13, 14, 14, 15}; - -// Coefficients for source bytes 0 to 10 -static const uvec8 kMadd01 = {3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2}; - -// Coefficients for source bytes 10 to 21 -static const uvec8 kMadd11 = {1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1}; - -// Coefficients for source bytes 21 to 31 -static const uvec8 kMadd21 = {2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3}; - -// Coefficients for source bytes 21 to 31 -static const vec16 kRound34 = {2, 2, 2, 2, 2, 2, 2, 2}; - -static const uvec8 kShuf38a = {0, 3, 6, 8, 11, 14, 128, 128, - 128, 128, 128, 128, 128, 128, 128, 128}; - -static const uvec8 kShuf38b = {128, 128, 128, 128, 128, 128, 0, 3, - 6, 8, 11, 14, 128, 128, 128, 128}; - -// Arrange words 0,3,6 into 0,1,2 -static const uvec8 kShufAc = {0, 1, 6, 7, 12, 13, 128, 128, - 128, 128, 128, 128, 128, 128, 128, 128}; - -// Arrange words 0,3,6 into 3,4,5 -static const uvec8 kShufAc3 = {128, 128, 128, 128, 128, 128, 0, 1, - 6, 7, 12, 13, 128, 128, 128, 128}; - -// Scaling values for boxes of 3x3 and 2x3 -static const uvec16 kScaleAc33 = {65536 / 9, 65536 / 9, 65536 / 6, 65536 / 9, - 65536 / 9, 65536 / 6, 0, 0}; - -// Arrange first value for pixels 0,1,2,3,4,5 -static const uvec8 kShufAb0 = {0, 128, 3, 128, 6, 128, 8, 128, - 11, 128, 14, 128, 128, 128, 128, 128}; - -// Arrange second value for pixels 0,1,2,3,4,5 -static const uvec8 kShufAb1 = {1, 128, 4, 128, 7, 128, 9, 128, - 12, 128, 15, 128, 128, 128, 128, 128}; - -// Arrange third value for pixels 0,1,2,3,4,5 -static const uvec8 kShufAb2 = {2, 128, 5, 128, 128, 128, 10, 128, - 13, 128, 128, 128, 128, 128, 128, 128}; - -// Scaling values for boxes of 3x2 and 2x2 -static const uvec16 kScaleAb2 = {65536 / 3, 65536 / 3, 65536 / 2, 65536 / 3, - 65536 / 3, 65536 / 2, 0, 0}; - -// Reads 32 pixels, throws half away and writes 16 pixels. -__declspec(naked) void ScaleRowDown2_SSSE3(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width) { - __asm { - mov eax, [esp + 4] // src_ptr - // src_stride ignored - mov edx, [esp + 12] // dst_ptr - mov ecx, [esp + 16] // dst_width - - wloop: - movdqu xmm0, [eax] - movdqu xmm1, [eax + 16] - lea eax, [eax + 32] - psrlw xmm0, 8 // isolate odd pixels. - psrlw xmm1, 8 - packuswb xmm0, xmm1 - movdqu [edx], xmm0 - lea edx, [edx + 16] - sub ecx, 16 - jg wloop - - ret - } -} - -// Blends 32x1 rectangle to 16x1. -__declspec(naked) void ScaleRowDown2Linear_SSSE3(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width) { - __asm { - mov eax, [esp + 4] // src_ptr - // src_stride - mov edx, [esp + 12] // dst_ptr - mov ecx, [esp + 16] // dst_width - - pcmpeqb xmm4, xmm4 // constant 0x0101 - psrlw xmm4, 15 - packuswb xmm4, xmm4 - pxor xmm5, xmm5 // constant 0 - - wloop: - movdqu xmm0, [eax] - movdqu xmm1, [eax + 16] - lea eax, [eax + 32] - pmaddubsw xmm0, xmm4 // horizontal add - pmaddubsw xmm1, xmm4 - pavgw xmm0, xmm5 // (x + 1) / 2 - pavgw xmm1, xmm5 - packuswb xmm0, xmm1 - movdqu [edx], xmm0 - lea edx, [edx + 16] - sub ecx, 16 - jg wloop - - ret - } -} - -// Blends 32x2 rectangle to 16x1. 
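-// Note: pmaddubsw with the 0x0101 constant sums each pair of adjacent bytes
-// into a 16 bit word; adding the two rows, halving, then pavgw against zero
-// gives a rounded 2x2 average, e.g. the block 3,4 / 4,5 produces 4.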
-__declspec(naked) void ScaleRowDown2Box_SSSE3(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width) { - __asm { - push esi - mov eax, [esp + 4 + 4] // src_ptr - mov esi, [esp + 4 + 8] // src_stride - mov edx, [esp + 4 + 12] // dst_ptr - mov ecx, [esp + 4 + 16] // dst_width - - pcmpeqb xmm4, xmm4 // constant 0x0101 - psrlw xmm4, 15 - packuswb xmm4, xmm4 - pxor xmm5, xmm5 // constant 0 - - wloop: - movdqu xmm0, [eax] - movdqu xmm1, [eax + 16] - movdqu xmm2, [eax + esi] - movdqu xmm3, [eax + esi + 16] - lea eax, [eax + 32] - pmaddubsw xmm0, xmm4 // horizontal add - pmaddubsw xmm1, xmm4 - pmaddubsw xmm2, xmm4 - pmaddubsw xmm3, xmm4 - paddw xmm0, xmm2 // vertical add - paddw xmm1, xmm3 - psrlw xmm0, 1 - psrlw xmm1, 1 - pavgw xmm0, xmm5 // (x + 1) / 2 - pavgw xmm1, xmm5 - packuswb xmm0, xmm1 - movdqu [edx], xmm0 - lea edx, [edx + 16] - sub ecx, 16 - jg wloop - - pop esi - ret - } -} - -#ifdef HAS_SCALEROWDOWN2_AVX2 -// Reads 64 pixels, throws half away and writes 32 pixels. -__declspec(naked) void ScaleRowDown2_AVX2(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width) { - __asm { - mov eax, [esp + 4] // src_ptr - // src_stride ignored - mov edx, [esp + 12] // dst_ptr - mov ecx, [esp + 16] // dst_width - - wloop: - vmovdqu ymm0, [eax] - vmovdqu ymm1, [eax + 32] - lea eax, [eax + 64] - vpsrlw ymm0, ymm0, 8 // isolate odd pixels. - vpsrlw ymm1, ymm1, 8 - vpackuswb ymm0, ymm0, ymm1 - vpermq ymm0, ymm0, 0xd8 // unmutate vpackuswb - vmovdqu [edx], ymm0 - lea edx, [edx + 32] - sub ecx, 32 - jg wloop - - vzeroupper - ret - } -} - -// Blends 64x1 rectangle to 32x1. -__declspec(naked) void ScaleRowDown2Linear_AVX2(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width) { - __asm { - mov eax, [esp + 4] // src_ptr - // src_stride - mov edx, [esp + 12] // dst_ptr - mov ecx, [esp + 16] // dst_width - - vpcmpeqb ymm4, ymm4, ymm4 // '1' constant, 8b - vpsrlw ymm4, ymm4, 15 - vpackuswb ymm4, ymm4, ymm4 - vpxor ymm5, ymm5, ymm5 // constant 0 - - wloop: - vmovdqu ymm0, [eax] - vmovdqu ymm1, [eax + 32] - lea eax, [eax + 64] - vpmaddubsw ymm0, ymm0, ymm4 // horizontal add - vpmaddubsw ymm1, ymm1, ymm4 - vpavgw ymm0, ymm0, ymm5 // (x + 1) / 2 - vpavgw ymm1, ymm1, ymm5 - vpackuswb ymm0, ymm0, ymm1 - vpermq ymm0, ymm0, 0xd8 // unmutate vpackuswb - vmovdqu [edx], ymm0 - lea edx, [edx + 32] - sub ecx, 32 - jg wloop - - vzeroupper - ret - } -} - -// For rounding, average = (sum + 2) / 4 -// becomes average((sum >> 1), 0) -// Blends 64x2 rectangle to 32x1. 
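-// e.g. for sum = 7: (7 + 2) / 4 = 2, and average(7 >> 1, 0) = (3 + 1) / 2 = 2.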
-__declspec(naked) void ScaleRowDown2Box_AVX2(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width) { - __asm { - push esi - mov eax, [esp + 4 + 4] // src_ptr - mov esi, [esp + 4 + 8] // src_stride - mov edx, [esp + 4 + 12] // dst_ptr - mov ecx, [esp + 4 + 16] // dst_width - - vpcmpeqb ymm4, ymm4, ymm4 // '1' constant, 8b - vpsrlw ymm4, ymm4, 15 - vpackuswb ymm4, ymm4, ymm4 - vpxor ymm5, ymm5, ymm5 // constant 0 - - wloop: - vmovdqu ymm0, [eax] - vmovdqu ymm1, [eax + 32] - vmovdqu ymm2, [eax + esi] - vmovdqu ymm3, [eax + esi + 32] - lea eax, [eax + 64] - vpmaddubsw ymm0, ymm0, ymm4 // horizontal add - vpmaddubsw ymm1, ymm1, ymm4 - vpmaddubsw ymm2, ymm2, ymm4 - vpmaddubsw ymm3, ymm3, ymm4 - vpaddw ymm0, ymm0, ymm2 // vertical add - vpaddw ymm1, ymm1, ymm3 - vpsrlw ymm0, ymm0, 1 // (x + 2) / 4 = (x / 2 + 1) / 2 - vpsrlw ymm1, ymm1, 1 - vpavgw ymm0, ymm0, ymm5 // (x + 1) / 2 - vpavgw ymm1, ymm1, ymm5 - vpackuswb ymm0, ymm0, ymm1 - vpermq ymm0, ymm0, 0xd8 // unmutate vpackuswb - vmovdqu [edx], ymm0 - lea edx, [edx + 32] - sub ecx, 32 - jg wloop - - pop esi - vzeroupper - ret - } -} -#endif // HAS_SCALEROWDOWN2_AVX2 - -// Point samples 32 pixels to 8 pixels. -__declspec(naked) void ScaleRowDown4_SSSE3(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width) { - __asm { - mov eax, [esp + 4] // src_ptr - // src_stride ignored - mov edx, [esp + 12] // dst_ptr - mov ecx, [esp + 16] // dst_width - pcmpeqb xmm5, xmm5 // generate mask 0x00ff0000 - psrld xmm5, 24 - pslld xmm5, 16 - - wloop: - movdqu xmm0, [eax] - movdqu xmm1, [eax + 16] - lea eax, [eax + 32] - pand xmm0, xmm5 - pand xmm1, xmm5 - packuswb xmm0, xmm1 - psrlw xmm0, 8 - packuswb xmm0, xmm0 - movq qword ptr [edx], xmm0 - lea edx, [edx + 8] - sub ecx, 8 - jg wloop - - ret - } -} - -// Blends 32x4 rectangle to 8x1. -__declspec(naked) void ScaleRowDown4Box_SSSE3(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width) { - __asm { - push esi - push edi - mov eax, [esp + 8 + 4] // src_ptr - mov esi, [esp + 8 + 8] // src_stride - mov edx, [esp + 8 + 12] // dst_ptr - mov ecx, [esp + 8 + 16] // dst_width - lea edi, [esi + esi * 2] // src_stride * 3 - pcmpeqb xmm4, xmm4 // constant 0x0101 - psrlw xmm4, 15 - movdqa xmm5, xmm4 - packuswb xmm4, xmm4 - psllw xmm5, 3 // constant 0x0008 - - wloop: - movdqu xmm0, [eax] // average rows - movdqu xmm1, [eax + 16] - movdqu xmm2, [eax + esi] - movdqu xmm3, [eax + esi + 16] - pmaddubsw xmm0, xmm4 // horizontal add - pmaddubsw xmm1, xmm4 - pmaddubsw xmm2, xmm4 - pmaddubsw xmm3, xmm4 - paddw xmm0, xmm2 // vertical add rows 0, 1 - paddw xmm1, xmm3 - movdqu xmm2, [eax + esi * 2] - movdqu xmm3, [eax + esi * 2 + 16] - pmaddubsw xmm2, xmm4 - pmaddubsw xmm3, xmm4 - paddw xmm0, xmm2 // add row 2 - paddw xmm1, xmm3 - movdqu xmm2, [eax + edi] - movdqu xmm3, [eax + edi + 16] - lea eax, [eax + 32] - pmaddubsw xmm2, xmm4 - pmaddubsw xmm3, xmm4 - paddw xmm0, xmm2 // add row 3 - paddw xmm1, xmm3 - phaddw xmm0, xmm1 - paddw xmm0, xmm5 // + 8 for round - psrlw xmm0, 4 // /16 for average of 4 * 4 - packuswb xmm0, xmm0 - movq qword ptr [edx], xmm0 - lea edx, [edx + 8] - sub ecx, 8 - jg wloop - - pop edi - pop esi - ret - } -} - -#ifdef HAS_SCALEROWDOWN4_AVX2 -// Point samples 64 pixels to 16 pixels. 
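-// The 0x00ff0000 mask comes from all ones shifted right 24 then left 16 per
-// 32 bit lane; anding with it keeps byte 2 of each 4 pixel group.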
-__declspec(naked) void ScaleRowDown4_AVX2(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width) { - __asm { - mov eax, [esp + 4] // src_ptr - // src_stride ignored - mov edx, [esp + 12] // dst_ptr - mov ecx, [esp + 16] // dst_width - vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff0000 - vpsrld ymm5, ymm5, 24 - vpslld ymm5, ymm5, 16 - - wloop: - vmovdqu ymm0, [eax] - vmovdqu ymm1, [eax + 32] - lea eax, [eax + 64] - vpand ymm0, ymm0, ymm5 - vpand ymm1, ymm1, ymm5 - vpackuswb ymm0, ymm0, ymm1 - vpermq ymm0, ymm0, 0xd8 // unmutate vpackuswb - vpsrlw ymm0, ymm0, 8 - vpackuswb ymm0, ymm0, ymm0 - vpermq ymm0, ymm0, 0xd8 // unmutate vpackuswb - vmovdqu [edx], xmm0 - lea edx, [edx + 16] - sub ecx, 16 - jg wloop - - vzeroupper - ret - } -} - -// Blends 64x4 rectangle to 16x1. -__declspec(naked) void ScaleRowDown4Box_AVX2(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width) { - __asm { - push esi - push edi - mov eax, [esp + 8 + 4] // src_ptr - mov esi, [esp + 8 + 8] // src_stride - mov edx, [esp + 8 + 12] // dst_ptr - mov ecx, [esp + 8 + 16] // dst_width - lea edi, [esi + esi * 2] // src_stride * 3 - vpcmpeqb ymm4, ymm4, ymm4 // constant 0x0101 - vpsrlw ymm4, ymm4, 15 - vpsllw ymm5, ymm4, 3 // constant 0x0008 - vpackuswb ymm4, ymm4, ymm4 - - wloop: - vmovdqu ymm0, [eax] // average rows - vmovdqu ymm1, [eax + 32] - vmovdqu ymm2, [eax + esi] - vmovdqu ymm3, [eax + esi + 32] - vpmaddubsw ymm0, ymm0, ymm4 // horizontal add - vpmaddubsw ymm1, ymm1, ymm4 - vpmaddubsw ymm2, ymm2, ymm4 - vpmaddubsw ymm3, ymm3, ymm4 - vpaddw ymm0, ymm0, ymm2 // vertical add rows 0, 1 - vpaddw ymm1, ymm1, ymm3 - vmovdqu ymm2, [eax + esi * 2] - vmovdqu ymm3, [eax + esi * 2 + 32] - vpmaddubsw ymm2, ymm2, ymm4 - vpmaddubsw ymm3, ymm3, ymm4 - vpaddw ymm0, ymm0, ymm2 // add row 2 - vpaddw ymm1, ymm1, ymm3 - vmovdqu ymm2, [eax + edi] - vmovdqu ymm3, [eax + edi + 32] - lea eax, [eax + 64] - vpmaddubsw ymm2, ymm2, ymm4 - vpmaddubsw ymm3, ymm3, ymm4 - vpaddw ymm0, ymm0, ymm2 // add row 3 - vpaddw ymm1, ymm1, ymm3 - vphaddw ymm0, ymm0, ymm1 // mutates - vpermq ymm0, ymm0, 0xd8 // unmutate vphaddw - vpaddw ymm0, ymm0, ymm5 // + 8 for round - vpsrlw ymm0, ymm0, 4 // /32 for average of 4 * 4 - vpackuswb ymm0, ymm0, ymm0 - vpermq ymm0, ymm0, 0xd8 // unmutate vpackuswb - vmovdqu [edx], xmm0 - lea edx, [edx + 16] - sub ecx, 16 - jg wloop - - pop edi - pop esi - vzeroupper - ret - } -} -#endif // HAS_SCALEROWDOWN4_AVX2 - -// Point samples 32 pixels to 24 pixels. -// Produces three 8 byte values. For each 8 bytes, 16 bytes are read. -// Then shuffled to do the scaling. - -__declspec(naked) void ScaleRowDown34_SSSE3(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width) { - __asm { - mov eax, [esp + 4] // src_ptr - // src_stride ignored - mov edx, [esp + 12] // dst_ptr - mov ecx, [esp + 16] // dst_width - movdqa xmm3, xmmword ptr kShuf0 - movdqa xmm4, xmmword ptr kShuf1 - movdqa xmm5, xmmword ptr kShuf2 - - wloop: - movdqu xmm0, [eax] - movdqu xmm1, [eax + 16] - lea eax, [eax + 32] - movdqa xmm2, xmm1 - palignr xmm1, xmm0, 8 - pshufb xmm0, xmm3 - pshufb xmm1, xmm4 - pshufb xmm2, xmm5 - movq qword ptr [edx], xmm0 - movq qword ptr [edx + 8], xmm1 - movq qword ptr [edx + 16], xmm2 - lea edx, [edx + 24] - sub ecx, 24 - jg wloop - - ret - } -} - -// Blends 32x2 rectangle to 24x1 -// Produces three 8 byte values. For each 8 bytes, 16 bytes are read. -// Then shuffled to do the scaling. 
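-// The kMadd tables hold the 3:1, 2:2 and 1:3 weights used to blend each
-// output pixel from its two nearest sources; kRound34 adds 2 before the
-// >> 2 so the result rounds.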
- -// Register usage: -// xmm0 src_row 0 -// xmm1 src_row 1 -// xmm2 shuf 0 -// xmm3 shuf 1 -// xmm4 shuf 2 -// xmm5 madd 0 -// xmm6 madd 1 -// xmm7 kRound34 - -// Note that movdqa+palign may be better than movdqu. -__declspec(naked) void ScaleRowDown34_1_Box_SSSE3(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width) { - __asm { - push esi - mov eax, [esp + 4 + 4] // src_ptr - mov esi, [esp + 4 + 8] // src_stride - mov edx, [esp + 4 + 12] // dst_ptr - mov ecx, [esp + 4 + 16] // dst_width - movdqa xmm2, xmmword ptr kShuf01 - movdqa xmm3, xmmword ptr kShuf11 - movdqa xmm4, xmmword ptr kShuf21 - movdqa xmm5, xmmword ptr kMadd01 - movdqa xmm6, xmmword ptr kMadd11 - movdqa xmm7, xmmword ptr kRound34 - - wloop: - movdqu xmm0, [eax] // pixels 0..7 - movdqu xmm1, [eax + esi] - pavgb xmm0, xmm1 - pshufb xmm0, xmm2 - pmaddubsw xmm0, xmm5 - paddsw xmm0, xmm7 - psrlw xmm0, 2 - packuswb xmm0, xmm0 - movq qword ptr [edx], xmm0 - movdqu xmm0, [eax + 8] // pixels 8..15 - movdqu xmm1, [eax + esi + 8] - pavgb xmm0, xmm1 - pshufb xmm0, xmm3 - pmaddubsw xmm0, xmm6 - paddsw xmm0, xmm7 - psrlw xmm0, 2 - packuswb xmm0, xmm0 - movq qword ptr [edx + 8], xmm0 - movdqu xmm0, [eax + 16] // pixels 16..23 - movdqu xmm1, [eax + esi + 16] - lea eax, [eax + 32] - pavgb xmm0, xmm1 - pshufb xmm0, xmm4 - movdqa xmm1, xmmword ptr kMadd21 - pmaddubsw xmm0, xmm1 - paddsw xmm0, xmm7 - psrlw xmm0, 2 - packuswb xmm0, xmm0 - movq qword ptr [edx + 16], xmm0 - lea edx, [edx + 24] - sub ecx, 24 - jg wloop - - pop esi - ret - } -} - -// Note that movdqa+palign may be better than movdqu. -__declspec(naked) void ScaleRowDown34_0_Box_SSSE3(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width) { - __asm { - push esi - mov eax, [esp + 4 + 4] // src_ptr - mov esi, [esp + 4 + 8] // src_stride - mov edx, [esp + 4 + 12] // dst_ptr - mov ecx, [esp + 4 + 16] // dst_width - movdqa xmm2, xmmword ptr kShuf01 - movdqa xmm3, xmmword ptr kShuf11 - movdqa xmm4, xmmword ptr kShuf21 - movdqa xmm5, xmmword ptr kMadd01 - movdqa xmm6, xmmword ptr kMadd11 - movdqa xmm7, xmmword ptr kRound34 - - wloop: - movdqu xmm0, [eax] // pixels 0..7 - movdqu xmm1, [eax + esi] - pavgb xmm1, xmm0 - pavgb xmm0, xmm1 - pshufb xmm0, xmm2 - pmaddubsw xmm0, xmm5 - paddsw xmm0, xmm7 - psrlw xmm0, 2 - packuswb xmm0, xmm0 - movq qword ptr [edx], xmm0 - movdqu xmm0, [eax + 8] // pixels 8..15 - movdqu xmm1, [eax + esi + 8] - pavgb xmm1, xmm0 - pavgb xmm0, xmm1 - pshufb xmm0, xmm3 - pmaddubsw xmm0, xmm6 - paddsw xmm0, xmm7 - psrlw xmm0, 2 - packuswb xmm0, xmm0 - movq qword ptr [edx + 8], xmm0 - movdqu xmm0, [eax + 16] // pixels 16..23 - movdqu xmm1, [eax + esi + 16] - lea eax, [eax + 32] - pavgb xmm1, xmm0 - pavgb xmm0, xmm1 - pshufb xmm0, xmm4 - movdqa xmm1, xmmword ptr kMadd21 - pmaddubsw xmm0, xmm1 - paddsw xmm0, xmm7 - psrlw xmm0, 2 - packuswb xmm0, xmm0 - movq qword ptr [edx + 16], xmm0 - lea edx, [edx+24] - sub ecx, 24 - jg wloop - - pop esi - ret - } -} - -// 3/8 point sampler - -// Scale 32 pixels to 12 -__declspec(naked) void ScaleRowDown38_SSSE3(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width) { - __asm { - mov eax, [esp + 4] // src_ptr - // src_stride ignored - mov edx, [esp + 12] // dst_ptr - mov ecx, [esp + 16] // dst_width - movdqa xmm4, xmmword ptr kShuf38a - movdqa xmm5, xmmword ptr kShuf38b - - xloop: - movdqu xmm0, [eax] // 16 pixels -> 0,1,2,3,4,5 - movdqu xmm1, [eax + 16] // 16 pixels -> 6,7,8,9,10,11 - lea eax, [eax + 32] - pshufb xmm0, xmm4 - pshufb xmm1, xmm5 - 
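-    // combine the two shuffled halves: bytes 0..5 from xmm0, 6..11 from xmm1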
paddusb xmm0, xmm1 - - movq qword ptr [edx], xmm0 // write 12 pixels - movhlps xmm1, xmm0 - movd [edx + 8], xmm1 - lea edx, [edx + 12] - sub ecx, 12 - jg xloop - - ret - } -} - -// Scale 16x3 pixels to 6x1 with interpolation -__declspec(naked) void ScaleRowDown38_3_Box_SSSE3(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width) { - __asm { - push esi - mov eax, [esp + 4 + 4] // src_ptr - mov esi, [esp + 4 + 8] // src_stride - mov edx, [esp + 4 + 12] // dst_ptr - mov ecx, [esp + 4 + 16] // dst_width - movdqa xmm2, xmmword ptr kShufAc - movdqa xmm3, xmmword ptr kShufAc3 - movdqa xmm4, xmmword ptr kScaleAc33 - pxor xmm5, xmm5 - - xloop: - movdqu xmm0, [eax] // sum up 3 rows into xmm0/1 - movdqu xmm6, [eax + esi] - movhlps xmm1, xmm0 - movhlps xmm7, xmm6 - punpcklbw xmm0, xmm5 - punpcklbw xmm1, xmm5 - punpcklbw xmm6, xmm5 - punpcklbw xmm7, xmm5 - paddusw xmm0, xmm6 - paddusw xmm1, xmm7 - movdqu xmm6, [eax + esi * 2] - lea eax, [eax + 16] - movhlps xmm7, xmm6 - punpcklbw xmm6, xmm5 - punpcklbw xmm7, xmm5 - paddusw xmm0, xmm6 - paddusw xmm1, xmm7 - - movdqa xmm6, xmm0 // 8 pixels -> 0,1,2 of xmm6 - psrldq xmm0, 2 - paddusw xmm6, xmm0 - psrldq xmm0, 2 - paddusw xmm6, xmm0 - pshufb xmm6, xmm2 - - movdqa xmm7, xmm1 // 8 pixels -> 3,4,5 of xmm6 - psrldq xmm1, 2 - paddusw xmm7, xmm1 - psrldq xmm1, 2 - paddusw xmm7, xmm1 - pshufb xmm7, xmm3 - paddusw xmm6, xmm7 - - pmulhuw xmm6, xmm4 // divide by 9,9,6, 9,9,6 - packuswb xmm6, xmm6 - - movd [edx], xmm6 // write 6 pixels - psrlq xmm6, 16 - movd [edx + 2], xmm6 - lea edx, [edx + 6] - sub ecx, 6 - jg xloop - - pop esi - ret - } -} - -// Scale 16x2 pixels to 6x1 with interpolation -__declspec(naked) void ScaleRowDown38_2_Box_SSSE3(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width) { - __asm { - push esi - mov eax, [esp + 4 + 4] // src_ptr - mov esi, [esp + 4 + 8] // src_stride - mov edx, [esp + 4 + 12] // dst_ptr - mov ecx, [esp + 4 + 16] // dst_width - movdqa xmm2, xmmword ptr kShufAb0 - movdqa xmm3, xmmword ptr kShufAb1 - movdqa xmm4, xmmword ptr kShufAb2 - movdqa xmm5, xmmword ptr kScaleAb2 - - xloop: - movdqu xmm0, [eax] // average 2 rows into xmm0 - movdqu xmm1, [eax + esi] - lea eax, [eax + 16] - pavgb xmm0, xmm1 - - movdqa xmm1, xmm0 // 16 pixels -> 0,1,2,3,4,5 of xmm1 - pshufb xmm1, xmm2 - movdqa xmm6, xmm0 - pshufb xmm6, xmm3 - paddusw xmm1, xmm6 - pshufb xmm0, xmm4 - paddusw xmm1, xmm0 - - pmulhuw xmm1, xmm5 // divide by 3,3,2, 3,3,2 - packuswb xmm1, xmm1 - - movd [edx], xmm1 // write 6 pixels - psrlq xmm1, 16 - movd [edx + 2], xmm1 - lea edx, [edx + 6] - sub ecx, 6 - jg xloop - - pop esi - ret - } -} - -// Reads 16 bytes and accumulates to 16 shorts at a time. -__declspec(naked) void ScaleAddRow_SSE2(const uint8_t* src_ptr, - uint16_t* dst_ptr, - int src_width) { - __asm { - mov eax, [esp + 4] // src_ptr - mov edx, [esp + 8] // dst_ptr - mov ecx, [esp + 12] // src_width - pxor xmm5, xmm5 - - // sum rows - xloop: - movdqu xmm3, [eax] // read 16 bytes - lea eax, [eax + 16] - movdqu xmm0, [edx] // read 16 words from destination - movdqu xmm1, [edx + 16] - movdqa xmm2, xmm3 - punpcklbw xmm2, xmm5 - punpckhbw xmm3, xmm5 - paddusw xmm0, xmm2 // sum 16 words - paddusw xmm1, xmm3 - movdqu [edx], xmm0 // write 16 words to destination - movdqu [edx + 16], xmm1 - lea edx, [edx + 32] - sub ecx, 16 - jg xloop - ret - } -} - -#ifdef HAS_SCALEADDROW_AVX2 -// Reads 32 bytes and accumulates to 32 shorts at a time. 
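-// Same idea as the SSE2 version above: widen the bytes and add them into the
-// 16 bit sums at dst_ptr, which box filtering averages once all rows are added.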
-__declspec(naked) void ScaleAddRow_AVX2(const uint8_t* src_ptr, - uint16_t* dst_ptr, - int src_width) { - __asm { - mov eax, [esp + 4] // src_ptr - mov edx, [esp + 8] // dst_ptr - mov ecx, [esp + 12] // src_width - vpxor ymm5, ymm5, ymm5 - - // sum rows - xloop: - vmovdqu ymm3, [eax] // read 32 bytes - lea eax, [eax + 32] - vpermq ymm3, ymm3, 0xd8 // unmutate for vpunpck - vpunpcklbw ymm2, ymm3, ymm5 - vpunpckhbw ymm3, ymm3, ymm5 - vpaddusw ymm0, ymm2, [edx] // sum 16 words - vpaddusw ymm1, ymm3, [edx + 32] - vmovdqu [edx], ymm0 // write 32 words to destination - vmovdqu [edx + 32], ymm1 - lea edx, [edx + 64] - sub ecx, 32 - jg xloop - - vzeroupper - ret - } -} -#endif // HAS_SCALEADDROW_AVX2 - -// Constant for making pixels signed to avoid pmaddubsw -// saturation. -static const uvec8 kFsub80 = {0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, - 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80}; - -// Constant for making pixels unsigned and adding .5 for rounding. -static const uvec16 kFadd40 = {0x4040, 0x4040, 0x4040, 0x4040, - 0x4040, 0x4040, 0x4040, 0x4040}; - -// Bilinear column filtering. SSSE3 version. -__declspec(naked) void ScaleFilterCols_SSSE3(uint8_t* dst_ptr, - const uint8_t* src_ptr, - int dst_width, - int x, - int dx) { - __asm { - push ebx - push esi - push edi - mov edi, [esp + 12 + 4] // dst_ptr - mov esi, [esp + 12 + 8] // src_ptr - mov ecx, [esp + 12 + 12] // dst_width - movd xmm2, [esp + 12 + 16] // x - movd xmm3, [esp + 12 + 20] // dx - mov eax, 0x04040000 // shuffle to line up fractions with pixel. - movd xmm5, eax - pcmpeqb xmm6, xmm6 // generate 0x007f for inverting fraction. - psrlw xmm6, 9 - pcmpeqb xmm7, xmm7 // generate 0x0001 - psrlw xmm7, 15 - pextrw eax, xmm2, 1 // get x0 integer. preroll - sub ecx, 2 - jl xloop29 - - movdqa xmm0, xmm2 // x1 = x0 + dx - paddd xmm0, xmm3 - punpckldq xmm2, xmm0 // x0 x1 - punpckldq xmm3, xmm3 // dx dx - paddd xmm3, xmm3 // dx * 2, dx * 2 - pextrw edx, xmm2, 3 // get x1 integer. preroll - - // 2 Pixel loop. - xloop2: - movdqa xmm1, xmm2 // x0, x1 fractions. - paddd xmm2, xmm3 // x += dx - movzx ebx, word ptr [esi + eax] // 2 source x0 pixels - movd xmm0, ebx - psrlw xmm1, 9 // 7 bit fractions. - movzx ebx, word ptr [esi + edx] // 2 source x1 pixels - movd xmm4, ebx - pshufb xmm1, xmm5 // 0011 - punpcklwd xmm0, xmm4 - psubb xmm0, xmmword ptr kFsub80 // make pixels signed. - pxor xmm1, xmm6 // 0..7f and 7f..0 - paddusb xmm1, xmm7 // +1 so 0..7f and 80..1 - pmaddubsw xmm1, xmm0 // 16 bit, 2 pixels. - pextrw eax, xmm2, 1 // get x0 integer. next iteration. - pextrw edx, xmm2, 3 // get x1 integer. next iteration. - paddw xmm1, xmmword ptr kFadd40 // make pixels unsigned and round. - psrlw xmm1, 7 // 8.7 fixed point to low 8 bits. - packuswb xmm1, xmm1 // 8 bits, 2 pixels. - movd ebx, xmm1 - mov [edi], bx - lea edi, [edi + 2] - sub ecx, 2 // 2 pixels - jge xloop2 - - xloop29: - add ecx, 2 - 1 - jl xloop99 - - // 1 pixel remainder - movzx ebx, word ptr [esi + eax] // 2 source x0 pixels - movd xmm0, ebx - psrlw xmm2, 9 // 7 bit fractions. - pshufb xmm2, xmm5 // 0011 - psubb xmm0, xmmword ptr kFsub80 // make pixels signed. - pxor xmm2, xmm6 // 0..7f and 7f..0 - paddusb xmm2, xmm7 // +1 so 0..7f and 80..1 - pmaddubsw xmm2, xmm0 // 16 bit - paddw xmm2, xmmword ptr kFadd40 // make pixels unsigned and round. - psrlw xmm2, 7 // 8.7 fixed point to low 8 bits. - packuswb xmm2, xmm2 // 8 bits - movd ebx, xmm2 - mov [edi], bl - - xloop99: - - pop edi - pop esi - pop ebx - ret - } -} - -// Reads 16 pixels, duplicates them and writes 32 pixels. 
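-// punpcklbw/punpckhbw of a register with itself interleaves every byte with a
-// copy of itself, e.g. a,b,c,d -> a,a,b,b,c,c,d,d.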
-__declspec(naked) void ScaleColsUp2_SSE2(uint8_t* dst_ptr, - const uint8_t* src_ptr, - int dst_width, - int x, - int dx) { - __asm { - mov edx, [esp + 4] // dst_ptr - mov eax, [esp + 8] // src_ptr - mov ecx, [esp + 12] // dst_width - - wloop: - movdqu xmm0, [eax] - lea eax, [eax + 16] - movdqa xmm1, xmm0 - punpcklbw xmm0, xmm0 - punpckhbw xmm1, xmm1 - movdqu [edx], xmm0 - movdqu [edx + 16], xmm1 - lea edx, [edx + 32] - sub ecx, 32 - jg wloop - - ret - } -} - -// Reads 8 pixels, throws half away and writes 4 even pixels (0, 2, 4, 6) -__declspec(naked) void ScaleARGBRowDown2_SSE2(const uint8_t* src_argb, - ptrdiff_t src_stride, - uint8_t* dst_argb, - int dst_width) { - __asm { - mov eax, [esp + 4] // src_argb - // src_stride ignored - mov edx, [esp + 12] // dst_argb - mov ecx, [esp + 16] // dst_width - - wloop: - movdqu xmm0, [eax] - movdqu xmm1, [eax + 16] - lea eax, [eax + 32] - shufps xmm0, xmm1, 0xdd - movdqu [edx], xmm0 - lea edx, [edx + 16] - sub ecx, 4 - jg wloop - - ret - } -} - -// Blends 8x1 rectangle to 4x1. -__declspec(naked) void ScaleARGBRowDown2Linear_SSE2(const uint8_t* src_argb, - ptrdiff_t src_stride, - uint8_t* dst_argb, - int dst_width) { - __asm { - mov eax, [esp + 4] // src_argb - // src_stride ignored - mov edx, [esp + 12] // dst_argb - mov ecx, [esp + 16] // dst_width - - wloop: - movdqu xmm0, [eax] - movdqu xmm1, [eax + 16] - lea eax, [eax + 32] - movdqa xmm2, xmm0 - shufps xmm0, xmm1, 0x88 // even pixels - shufps xmm2, xmm1, 0xdd // odd pixels - pavgb xmm0, xmm2 - movdqu [edx], xmm0 - lea edx, [edx + 16] - sub ecx, 4 - jg wloop - - ret - } -} - -// Blends 8x2 rectangle to 4x1. -__declspec(naked) void ScaleARGBRowDown2Box_SSE2(const uint8_t* src_argb, - ptrdiff_t src_stride, - uint8_t* dst_argb, - int dst_width) { - __asm { - push esi - mov eax, [esp + 4 + 4] // src_argb - mov esi, [esp + 4 + 8] // src_stride - mov edx, [esp + 4 + 12] // dst_argb - mov ecx, [esp + 4 + 16] // dst_width - - wloop: - movdqu xmm0, [eax] - movdqu xmm1, [eax + 16] - movdqu xmm2, [eax + esi] - movdqu xmm3, [eax + esi + 16] - lea eax, [eax + 32] - pavgb xmm0, xmm2 // average rows - pavgb xmm1, xmm3 - movdqa xmm2, xmm0 // average columns (8 to 4 pixels) - shufps xmm0, xmm1, 0x88 // even pixels - shufps xmm2, xmm1, 0xdd // odd pixels - pavgb xmm0, xmm2 - movdqu [edx], xmm0 - lea edx, [edx + 16] - sub ecx, 4 - jg wloop - - pop esi - ret - } -} - -// Reads 4 pixels at a time. -__declspec(naked) void ScaleARGBRowDownEven_SSE2(const uint8_t* src_argb, - ptrdiff_t src_stride, - int src_stepx, - uint8_t* dst_argb, - int dst_width) { - __asm { - push ebx - push edi - mov eax, [esp + 8 + 4] // src_argb - // src_stride ignored - mov ebx, [esp + 8 + 12] // src_stepx - mov edx, [esp + 8 + 16] // dst_argb - mov ecx, [esp + 8 + 20] // dst_width - lea ebx, [ebx * 4] - lea edi, [ebx + ebx * 2] - - wloop: - movd xmm0, [eax] - movd xmm1, [eax + ebx] - punpckldq xmm0, xmm1 - movd xmm2, [eax + ebx * 2] - movd xmm3, [eax + edi] - lea eax, [eax + ebx * 4] - punpckldq xmm2, xmm3 - punpcklqdq xmm0, xmm2 - movdqu [edx], xmm0 - lea edx, [edx + 16] - sub ecx, 4 - jg wloop - - pop edi - pop ebx - ret - } -} - -// Blends four 2x2 to 4x1. 
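-// shufps with 0x88 gathers the even ARGB pixels and 0xdd the odd ones, so the
-// final pavgb averages each horizontal pair after the two rows were averaged.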
-__declspec(naked) void ScaleARGBRowDownEvenBox_SSE2(const uint8_t* src_argb, - ptrdiff_t src_stride, - int src_stepx, - uint8_t* dst_argb, - int dst_width) { - __asm { - push ebx - push esi - push edi - mov eax, [esp + 12 + 4] // src_argb - mov esi, [esp + 12 + 8] // src_stride - mov ebx, [esp + 12 + 12] // src_stepx - mov edx, [esp + 12 + 16] // dst_argb - mov ecx, [esp + 12 + 20] // dst_width - lea esi, [eax + esi] // row1 pointer - lea ebx, [ebx * 4] - lea edi, [ebx + ebx * 2] - - wloop: - movq xmm0, qword ptr [eax] // row0 4 pairs - movhps xmm0, qword ptr [eax + ebx] - movq xmm1, qword ptr [eax + ebx * 2] - movhps xmm1, qword ptr [eax + edi] - lea eax, [eax + ebx * 4] - movq xmm2, qword ptr [esi] // row1 4 pairs - movhps xmm2, qword ptr [esi + ebx] - movq xmm3, qword ptr [esi + ebx * 2] - movhps xmm3, qword ptr [esi + edi] - lea esi, [esi + ebx * 4] - pavgb xmm0, xmm2 // average rows - pavgb xmm1, xmm3 - movdqa xmm2, xmm0 // average columns (8 to 4 pixels) - shufps xmm0, xmm1, 0x88 // even pixels - shufps xmm2, xmm1, 0xdd // odd pixels - pavgb xmm0, xmm2 - movdqu [edx], xmm0 - lea edx, [edx + 16] - sub ecx, 4 - jg wloop - - pop edi - pop esi - pop ebx - ret - } -} - -// Column scaling unfiltered. SSE2 version. -__declspec(naked) void ScaleARGBCols_SSE2(uint8_t* dst_argb, - const uint8_t* src_argb, - int dst_width, - int x, - int dx) { - __asm { - push edi - push esi - mov edi, [esp + 8 + 4] // dst_argb - mov esi, [esp + 8 + 8] // src_argb - mov ecx, [esp + 8 + 12] // dst_width - movd xmm2, [esp + 8 + 16] // x - movd xmm3, [esp + 8 + 20] // dx - - pshufd xmm2, xmm2, 0 // x0 x0 x0 x0 - pshufd xmm0, xmm3, 0x11 // dx 0 dx 0 - paddd xmm2, xmm0 - paddd xmm3, xmm3 // 0, 0, 0, dx * 2 - pshufd xmm0, xmm3, 0x05 // dx * 2, dx * 2, 0, 0 - paddd xmm2, xmm0 // x3 x2 x1 x0 - paddd xmm3, xmm3 // 0, 0, 0, dx * 4 - pshufd xmm3, xmm3, 0 // dx * 4, dx * 4, dx * 4, dx * 4 - - pextrw eax, xmm2, 1 // get x0 integer. - pextrw edx, xmm2, 3 // get x1 integer. - - cmp ecx, 0 - jle xloop99 - sub ecx, 4 - jl xloop49 - - // 4 Pixel loop. - xloop4: - movd xmm0, [esi + eax * 4] // 1 source x0 pixels - movd xmm1, [esi + edx * 4] // 1 source x1 pixels - pextrw eax, xmm2, 5 // get x2 integer. - pextrw edx, xmm2, 7 // get x3 integer. - paddd xmm2, xmm3 // x += dx - punpckldq xmm0, xmm1 // x0 x1 - - movd xmm1, [esi + eax * 4] // 1 source x2 pixels - movd xmm4, [esi + edx * 4] // 1 source x3 pixels - pextrw eax, xmm2, 1 // get x0 integer. next iteration. - pextrw edx, xmm2, 3 // get x1 integer. next iteration. - punpckldq xmm1, xmm4 // x2 x3 - punpcklqdq xmm0, xmm1 // x0 x1 x2 x3 - movdqu [edi], xmm0 - lea edi, [edi + 16] - sub ecx, 4 // 4 pixels - jge xloop4 - - xloop49: - test ecx, 2 - je xloop29 - - // 2 Pixels. - movd xmm0, [esi + eax * 4] // 1 source x0 pixels - movd xmm1, [esi + edx * 4] // 1 source x1 pixels - pextrw eax, xmm2, 5 // get x2 integer. - punpckldq xmm0, xmm1 // x0 x1 - - movq qword ptr [edi], xmm0 - lea edi, [edi + 8] - - xloop29: - test ecx, 1 - je xloop99 - - // 1 Pixels. - movd xmm0, [esi + eax * 4] // 1 source x2 pixels - movd dword ptr [edi], xmm0 - xloop99: - - pop esi - pop edi - ret - } -} - -// Bilinear row filtering combines 2x1 -> 1x1. SSSE3 version. 
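-// Each output pixel is a weighted blend of the two nearest source pixels,
-// using the top 7 bits of the 16 bit x fraction as the weight (>> 7 rescales).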
-// TODO(fbarchard): Port to Neon - -// Shuffle table for arranging 2 pixels into pairs for pmaddubsw -static const uvec8 kShuffleColARGB = { - 0u, 4u, 1u, 5u, 2u, 6u, 3u, 7u, // bbggrraa 1st pixel - 8u, 12u, 9u, 13u, 10u, 14u, 11u, 15u // bbggrraa 2nd pixel -}; - -// Shuffle table for duplicating 2 fractions into 8 bytes each -static const uvec8 kShuffleFractions = { - 0u, 0u, 0u, 0u, 0u, 0u, 0u, 0u, 4u, 4u, 4u, 4u, 4u, 4u, 4u, 4u, -}; - -__declspec(naked) void ScaleARGBFilterCols_SSSE3(uint8_t* dst_argb, - const uint8_t* src_argb, - int dst_width, - int x, - int dx) { - __asm { - push esi - push edi - mov edi, [esp + 8 + 4] // dst_argb - mov esi, [esp + 8 + 8] // src_argb - mov ecx, [esp + 8 + 12] // dst_width - movd xmm2, [esp + 8 + 16] // x - movd xmm3, [esp + 8 + 20] // dx - movdqa xmm4, xmmword ptr kShuffleColARGB - movdqa xmm5, xmmword ptr kShuffleFractions - pcmpeqb xmm6, xmm6 // generate 0x007f for inverting fraction. - psrlw xmm6, 9 - pextrw eax, xmm2, 1 // get x0 integer. preroll - sub ecx, 2 - jl xloop29 - - movdqa xmm0, xmm2 // x1 = x0 + dx - paddd xmm0, xmm3 - punpckldq xmm2, xmm0 // x0 x1 - punpckldq xmm3, xmm3 // dx dx - paddd xmm3, xmm3 // dx * 2, dx * 2 - pextrw edx, xmm2, 3 // get x1 integer. preroll - - // 2 Pixel loop. - xloop2: - movdqa xmm1, xmm2 // x0, x1 fractions. - paddd xmm2, xmm3 // x += dx - movq xmm0, qword ptr [esi + eax * 4] // 2 source x0 pixels - psrlw xmm1, 9 // 7 bit fractions. - movhps xmm0, qword ptr [esi + edx * 4] // 2 source x1 pixels - pshufb xmm1, xmm5 // 0000000011111111 - pshufb xmm0, xmm4 // arrange pixels into pairs - pxor xmm1, xmm6 // 0..7f and 7f..0 - pmaddubsw xmm0, xmm1 // argb_argb 16 bit, 2 pixels. - pextrw eax, xmm2, 1 // get x0 integer. next iteration. - pextrw edx, xmm2, 3 // get x1 integer. next iteration. - psrlw xmm0, 7 // argb 8.7 fixed point to low 8 bits. - packuswb xmm0, xmm0 // argb_argb 8 bits, 2 pixels. - movq qword ptr [edi], xmm0 - lea edi, [edi + 8] - sub ecx, 2 // 2 pixels - jge xloop2 - - xloop29: - - add ecx, 2 - 1 - jl xloop99 - - // 1 pixel remainder - psrlw xmm2, 9 // 7 bit fractions. - movq xmm0, qword ptr [esi + eax * 4] // 2 source x0 pixels - pshufb xmm2, xmm5 // 00000000 - pshufb xmm0, xmm4 // arrange pixels into pairs - pxor xmm2, xmm6 // 0..7f and 7f..0 - pmaddubsw xmm0, xmm2 // argb 16 bit, 1 pixel. - psrlw xmm0, 7 - packuswb xmm0, xmm0 // argb 8 bits, 1 pixel. - movd [edi], xmm0 - - xloop99: - - pop edi - pop esi - ret - } -} - -// Reads 4 pixels, duplicates them and writes 8 pixels. -__declspec(naked) void ScaleARGBColsUp2_SSE2(uint8_t* dst_argb, - const uint8_t* src_argb, - int dst_width, - int x, - int dx) { - __asm { - mov edx, [esp + 4] // dst_argb - mov eax, [esp + 8] // src_argb - mov ecx, [esp + 12] // dst_width - - wloop: - movdqu xmm0, [eax] - lea eax, [eax + 16] - movdqa xmm1, xmm0 - punpckldq xmm0, xmm0 - punpckhdq xmm1, xmm1 - movdqu [edx], xmm0 - movdqu [edx + 16], xmm1 - lea edx, [edx + 32] - sub ecx, 8 - jg wloop - - ret - } -} - -// Divide num by div and return as 16.16 fixed point result. -__declspec(naked) int FixedDiv_X86(int num, int div) { - __asm { - mov eax, [esp + 4] // num - cdq // extend num to 64 bits - shld edx, eax, 16 // 32.16 - shl eax, 16 - idiv dword ptr [esp + 8] - ret - } -} - -// Divide num by div and return as 16.16 fixed point result. 
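-// e.g. FixedDiv(1, 2) returns 0x8000, i.e. 0.5 in 16.16 fixed point. FixedDiv1
-// below effectively computes (num - 1) / (div - 1) for inclusive end points.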
-__declspec(naked) int FixedDiv1_X86(int num, int div) { - __asm { - mov eax, [esp + 4] // num - mov ecx, [esp + 8] // denom - cdq // extend num to 64 bits - shld edx, eax, 16 // 32.16 - shl eax, 16 - sub eax, 0x00010001 - sbb edx, 0 - sub ecx, 1 - idiv ecx - ret - } -} -#endif // !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) - -#ifdef __cplusplus -} // extern "C" -} // namespace libyuv -#endif diff --git a/thirdparty/libyuv/source/test.sh b/thirdparty/libyuv/source/test.sh deleted file mode 100644 index 7f12c3c..0000000 --- a/thirdparty/libyuv/source/test.sh +++ /dev/null @@ -1,35 +0,0 @@ -#!/bin/bash -set -x - -function runbenchmark1 { - perf record /google/src/cloud/fbarchard/clean/google3/blaze-bin/third_party/libyuv/libyuv_test --gunit_filter=*$1 --libyuv_width=1280 --libyuv_height=720 --libyuv_repeat=1000 --libyuv_flags=-1 --libyuv_cpu_info=-1 - perf report | grep AVX -} - -runbenchmark1 ABGRToI420 -runbenchmark1 Android420ToI420 -runbenchmark1 ARGBToI420 -runbenchmark1 Convert16To8Plane -runbenchmark1 ConvertToARGB -runbenchmark1 ConvertToI420 -runbenchmark1 CopyPlane -runbenchmark1 H010ToAB30 -runbenchmark1 H010ToAR30 -runbenchmark1 HalfFloatPlane -runbenchmark1 I010ToAB30 -runbenchmark1 I010ToAR30 -runbenchmark1 I420Copy -runbenchmark1 I420Psnr -runbenchmark1 I420Scale -runbenchmark1 I420Ssim -runbenchmark1 I420ToARGB -runbenchmark1 I420ToNV12 -runbenchmark1 I420ToUYVY -runbenchmark1 I422ToI420 -runbenchmark1 InitCpuFlags -runbenchmark1 J420ToARGB -runbenchmark1 NV12ToARGB -runbenchmark1 NV12ToI420 -runbenchmark1 NV12ToI420Rotate -runbenchmark1 SetCpuFlags -runbenchmark1 YUY2ToI420 diff --git a/thirdparty/libyuv/source/video_common.cc b/thirdparty/libyuv/source/video_common.cc deleted file mode 100644 index 92384c0..0000000 --- a/thirdparty/libyuv/source/video_common.cc +++ /dev/null @@ -1,62 +0,0 @@ -/* - * Copyright 2011 The LibYuv Project Authors. All rights reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#include "libyuv/video_common.h" - -#ifdef __cplusplus -namespace libyuv { -extern "C" { -#endif - -struct FourCCAliasEntry { - uint32_t alias; - uint32_t canonical; -}; - -#define NUM_ALIASES 18 -static const struct FourCCAliasEntry kFourCCAliases[NUM_ALIASES] = { - {FOURCC_IYUV, FOURCC_I420}, - {FOURCC_YU12, FOURCC_I420}, - {FOURCC_YU16, FOURCC_I422}, - {FOURCC_YU24, FOURCC_I444}, - {FOURCC_YUYV, FOURCC_YUY2}, - {FOURCC_YUVS, FOURCC_YUY2}, // kCMPixelFormat_422YpCbCr8_yuvs - {FOURCC_HDYC, FOURCC_UYVY}, - {FOURCC_2VUY, FOURCC_UYVY}, // kCMPixelFormat_422YpCbCr8 - {FOURCC_JPEG, FOURCC_MJPG}, // Note: JPEG has DHT while MJPG does not. - {FOURCC_DMB1, FOURCC_MJPG}, - {FOURCC_BA81, FOURCC_BGGR}, // deprecated. - {FOURCC_RGB3, FOURCC_RAW}, - {FOURCC_BGR3, FOURCC_24BG}, - {FOURCC_CM32, FOURCC_BGRA}, // kCMPixelFormat_32ARGB - {FOURCC_CM24, FOURCC_RAW}, // kCMPixelFormat_24RGB - {FOURCC_L555, FOURCC_RGBO}, // kCMPixelFormat_16LE555 - {FOURCC_L565, FOURCC_RGBP}, // kCMPixelFormat_16LE565 - {FOURCC_5551, FOURCC_RGBO}, // kCMPixelFormat_16LE5551 -}; -// TODO(fbarchard): Consider mapping kCMPixelFormat_32BGRA to FOURCC_ARGB. 
-// {FOURCC_BGRA, FOURCC_ARGB}, // kCMPixelFormat_32BGRA - -LIBYUV_API -uint32_t CanonicalFourCC(uint32_t fourcc) { - int i; - for (i = 0; i < NUM_ALIASES; ++i) { - if (kFourCCAliases[i].alias == fourcc) { - return kFourCCAliases[i].canonical; - } - } - // Not an alias, so return it as-is. - return fourcc; -} - -#ifdef __cplusplus -} // extern "C" -} // namespace libyuv -#endif diff --git a/thirdparty/libyuv/util/Makefile b/thirdparty/libyuv/util/Makefile deleted file mode 100644 index 40e74b6..0000000 --- a/thirdparty/libyuv/util/Makefile +++ /dev/null @@ -1,9 +0,0 @@ -psnr: psnr.cc ssim.cc psnr_main.cc -ifeq ($(CXX),icl) - $(CXX) /arch:SSE2 /Ox /openmp psnr.cc ssim.cc psnr_main.cc -else - $(CXX) -msse2 -O3 -fopenmp -static -o psnr psnr.cc ssim.cc psnr_main.cc -Wl,--strip-all -endif - -# for MacOS -# /usr/local/bin/g++-7 -msse2 -O3 -fopenmp -Bstatic -o psnr psnr.cc ssim.cc psnr_main.cc diff --git a/thirdparty/libyuv/util/color.cc b/thirdparty/libyuv/util/color.cc deleted file mode 100644 index 8c3bbef..0000000 --- a/thirdparty/libyuv/util/color.cc +++ /dev/null @@ -1,120 +0,0 @@ -/* - * Copyright 2021 The LibYuv Project Authors. All rights reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#include -#include -#include - -// This utility computes values needed to generate yuvconstants based on -// white point values. -// The yuv formulas are tuned for 8 bit YUV channels. - -// For those MCs that can be represented as kr and kb: -// Full range -// float M[3][3] -// {{1,0,2*(1-kr)},{1,-((2*kb)/((2-kb)*(1-kb-kr))),-((2*kr)/((2-kr)*(1-kb-kr)))},{1,2*(1-kb),0}}; -// float B[3] -// {1+(256*(1-kr))/255,1-(256*kb)/(255*(2-kb)*(1-kb-kr))-(256*kr)/(255*(2-kr)*(1-kb-kr)),1+(256*(1-kb))/255}; -// Limited range -// float M[3][3] -// {{85/73,0,255/112-(255*kr)/112},{85/73,-((255*kb)/(112*(2-kb)*(1-kb-kr))),-((255*kr)/(112*(2-kr)*(1-kb-kr)))},{85/73,255/112-(255*kb)/112,0}}; -// float B[3] -// {77662/43435-(1537*kr)/1785,203/219-(1537*kb)/(1785*(2-kb)*(1-kb-kr))-(1537*kr)/(1785*(2-kr)*(1-kb-kr)),77662/43435-(1537*kb)/1785}; - -// mc bt -// 1 bt.709 KR = 0.2126; KB = 0.0722 -// 4 fcc KR = 0.30; KB = 0.11 -// 6 bt.601 KR = 0.299; KB = 0.114 -// 7 SMPTE 240M KR = 0.212; KB = 0.087 -// 10 bt2020 KR = 0.2627; KB = 0.0593 - -// BT.709 full range YUV to RGB reference -// R = Y + V * 1.5748 -// G = Y - U * 0.18732 - V * 0.46812 -// B = Y + U * 1.8556 -// KR = 0.2126 -// KB = 0.0722 - -// https://mymusing.co/bt601-yuv-to-rgb-conversion-color/ - -// // Y contribution to R,G,B. Scale and bias. -// #define YG 16320 /* round(1.000 * 64 * 256 * 256 / 257) */ -// #define YB 32 /* 64 / 2 */ -// -// // U and V contributions to R,G,B. -// #define UB 113 /* round(1.77200 * 64) */ -// #define UG 22 /* round(0.34414 * 64) */ -// #define VG 46 /* round(0.71414 * 64) */ -// #define VR 90 /* round(1.40200 * 64) */ -// -// // Bias values to round, and subtract 128 from U and V. 
-// #define BB (-UB * 128 + YB) -// #define BG (UG * 128 + VG * 128 + YB) -// #define BR (-VR * 128 + YB) - -int round(float v) { - return (int)(v + 0.5); -} - -int main(int argc, const char* argv[]) { - if (argc < 2) { - printf("color kr kb\n"); - return -1; - } - float kr = atof(argv[1]); - float kb = atof(argv[2]); - float kg = 1 - kr - kb; - - float vr = 2 * (1 - kr); - float ug = 2 * ((1 - kb) * kb / kg); - float vg = 2 * ((1 - kr) * kr / kg); - float ub = 2 * (1 - kb); - - printf("Full range\n"); - printf("R = Y + V * %5f\n", vr); - printf("G = Y - U * %6f - V * %6f\n", ug, vg); - printf("B = Y + U * %5f\n", ub); - - printf("KR = %4f; ", kr); - printf("KB = %4f\n", kb); - // printf("KG = %4f\n", kg); - // #define YG 16320 /* round(1.000 * 64 * 256 * 256 / 257) */ - // #define YB 32 /* 64 / 2 */ - // - // // U and V contributions to R,G,B. - - printf("UB %-3d /* round(%f * 64) */\n", round(ub * 64), ub); - printf("UG %-3d /* round(%f * 64) */\n", round(ug * 64), ug); - printf("VG %-3d /* round(%f * 64) */\n", round(vg * 64), vg); - printf("VR %-3d /* round(%f * 64) */\n", round(vr * 64), vr); - - vr = 255.f / 224.f * 2 * (1 - kr); - ug = 255.f / 224.f * 2 * ((1 - kb) * kb / kg); - vg = 255.f / 224.f * 2 * ((1 - kr) * kr / kg); - ub = 255.f / 224.f * 2 * (1 - kb); - - printf("Limited range\n"); - printf("R = (Y - 16) * 1.164 + V * %5f\n", vr); - printf("G = (Y - 16) * 1.164 - U * %6f - V * %6f\n", ug, vg); - printf("B = (Y - 16) * 1.164 + U * %5f\n", ub); - - // printf("KG = %4f\n", kg); - // #define YG 16320 /* round(1.000 * 64 * 256 * 256 / 257) */ - // #define YB 32 /* 64 / 2 */ - // - // // U and V contributions to R,G,B. - - printf("UB %-3d /* round(%f * 64) */\n", round(ub * 64), ub); - printf("UG %-3d /* round(%f * 64) */\n", round(ug * 64), ug); - printf("VG %-3d /* round(%f * 64) */\n", round(vg * 64), vg); - printf("VR %-3d /* round(%f * 64) */\n", round(vr * 64), vr); - - return 0; -} diff --git a/thirdparty/libyuv/util/compare.cc b/thirdparty/libyuv/util/compare.cc deleted file mode 100644 index a16613e..0000000 --- a/thirdparty/libyuv/util/compare.cc +++ /dev/null @@ -1,67 +0,0 @@ -/* - * Copyright 2012 The LibYuv Project Authors. All rights reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#include -#include -#include -#include - -#include "libyuv/basic_types.h" -#include "libyuv/compare.h" -#include "libyuv/version.h" - -int main(int argc, char** argv) { - if (argc < 1) { - printf("libyuv compare v%d\n", LIBYUV_VERSION); - printf("compare file1.yuv file2.yuv\n"); - return -1; - } - char* name1 = argv[1]; - char* name2 = (argc > 2) ? argv[2] : NULL; - FILE* fin1 = fopen(name1, "rb"); - FILE* fin2 = name2 ? fopen(name2, "rb") : NULL; - - const int kBlockSize = 32768; - uint8_t buf1[kBlockSize]; - uint8_t buf2[kBlockSize]; - uint32_t hash1 = 5381; - uint32_t hash2 = 5381; - uint64_t sum_square_err = 0; - uint64_t size_min = 0; - int amt1 = 0; - int amt2 = 0; - do { - amt1 = static_cast(fread(buf1, 1, kBlockSize, fin1)); - if (amt1 > 0) { - hash1 = libyuv::HashDjb2(buf1, amt1, hash1); - } - if (fin2) { - amt2 = static_cast(fread(buf2, 1, kBlockSize, fin2)); - if (amt2 > 0) { - hash2 = libyuv::HashDjb2(buf2, amt2, hash2); - } - int amt_min = (amt1 < amt2) ? 
amt1 : amt2; - size_min += amt_min; - sum_square_err += libyuv::ComputeSumSquareError(buf1, buf2, amt_min); - } - } while (amt1 > 0 || amt2 > 0); - - printf("hash1 %x", hash1); - if (fin2) { - printf(", hash2 %x", hash2); - double mse = - static_cast(sum_square_err) / static_cast(size_min); - printf(", mse %.2f", mse); - double psnr = libyuv::SumSquareErrorToPsnr(sum_square_err, size_min); - printf(", psnr %.2f\n", psnr); - fclose(fin2); - } - fclose(fin1); -} diff --git a/thirdparty/libyuv/util/cpuid.c b/thirdparty/libyuv/util/cpuid.c deleted file mode 100644 index 46f9c1b..0000000 --- a/thirdparty/libyuv/util/cpuid.c +++ /dev/null @@ -1,114 +0,0 @@ -/* - * Copyright 2012 The LibYuv Project Authors. All rights reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#include -#include -#include - -#include "libyuv/cpu_id.h" - -#ifdef __cplusplus -using namespace libyuv; -#endif - -int main(int argc, const char* argv[]) { - int cpu_flags = TestCpuFlag(-1); - int has_arm = TestCpuFlag(kCpuHasARM); - int has_mips = TestCpuFlag(kCpuHasMIPS); - int has_x86 = TestCpuFlag(kCpuHasX86); - (void)argc; - (void)argv; - -#if defined(__i386__) || defined(__x86_64__) || \ - defined(_M_IX86) || defined(_M_X64) - if (has_x86) { - int family, model, cpu_info[4]; - // Vendor ID: - // AuthenticAMD AMD processor - // CentaurHauls Centaur processor - // CyrixInstead Cyrix processor - // GenuineIntel Intel processor - // GenuineTMx86 Transmeta processor - // Geode by NSC National Semiconductor processor - // NexGenDriven NexGen processor - // RiseRiseRise Rise Technology processor - // SiS SiS SiS SiS processor - // UMC UMC UMC UMC processor - CpuId(0, 0, &cpu_info[0]); - cpu_info[0] = cpu_info[1]; // Reorder output - cpu_info[1] = cpu_info[3]; - cpu_info[3] = 0; - printf("Cpu Vendor: %s\n", (char*)(&cpu_info[0])); - - // CPU Family and Model - // 3:0 - Stepping - // 7:4 - Model - // 11:8 - Family - // 13:12 - Processor Type - // 19:16 - Extended Model - // 27:20 - Extended Family - CpuId(1, 0, &cpu_info[0]); - family = ((cpu_info[0] >> 8) & 0x0f) | ((cpu_info[0] >> 16) & 0xff0); - model = ((cpu_info[0] >> 4) & 0x0f) | ((cpu_info[0] >> 12) & 0xf0); - printf("Cpu Family %d (0x%x), Model %d (0x%x)\n", family, family, - model, model); - } -#endif - printf("Cpu Flags %x\n", cpu_flags); - printf("Has ARM %x\n", has_arm); - printf("Has MIPS %x\n", has_mips); - printf("Has X86 %x\n", has_x86); - if (has_arm) { - int has_neon = TestCpuFlag(kCpuHasNEON); - printf("Has NEON %x\n", has_neon); - } - if (has_mips) { - int has_msa = TestCpuFlag(kCpuHasMSA); - printf("Has MSA %x\n", has_msa); - int has_mmi = TestCpuFlag(kCpuHasMMI); - printf("Has MMI %x\n", has_mmi); - } - if (has_x86) { - int has_sse2 = TestCpuFlag(kCpuHasSSE2); - int has_ssse3 = TestCpuFlag(kCpuHasSSSE3); - int has_sse41 = TestCpuFlag(kCpuHasSSE41); - int has_sse42 = TestCpuFlag(kCpuHasSSE42); - int has_avx = TestCpuFlag(kCpuHasAVX); - int has_avx2 = TestCpuFlag(kCpuHasAVX2); - int has_erms = TestCpuFlag(kCpuHasERMS); - int has_fma3 = TestCpuFlag(kCpuHasFMA3); - int has_f16c = TestCpuFlag(kCpuHasF16C); - int has_gfni = TestCpuFlag(kCpuHasGFNI); - int has_avx512bw = TestCpuFlag(kCpuHasAVX512BW); - int has_avx512vl = TestCpuFlag(kCpuHasAVX512VL); - int has_avx512vbmi = 
 TestCpuFlag(kCpuHasAVX512VBMI);
-    int has_avx512vbmi2 = TestCpuFlag(kCpuHasAVX512VBMI2);
-    int has_avx512vbitalg = TestCpuFlag(kCpuHasAVX512VBITALG);
-    int has_avx512vpopcntdq = TestCpuFlag(kCpuHasAVX512VPOPCNTDQ);
-    printf("Has SSE2 %x\n", has_sse2);
-    printf("Has SSSE3 %x\n", has_ssse3);
-    printf("Has SSE4.1 %x\n", has_sse41);
-    printf("Has SSE4.2 %x\n", has_sse42);
-    printf("Has AVX %x\n", has_avx);
-    printf("Has AVX2 %x\n", has_avx2);
-    printf("Has ERMS %x\n", has_erms);
-    printf("Has FMA3 %x\n", has_fma3);
-    printf("Has F16C %x\n", has_f16c);
-    printf("Has GFNI %x\n", has_gfni);
-    printf("Has AVX512BW %x\n", has_avx512bw);
-    printf("Has AVX512VL %x\n", has_avx512vl);
-    printf("Has AVX512VBMI %x\n", has_avx512vbmi);
-    printf("Has AVX512VBMI2 %x\n", has_avx512vbmi2);
-    printf("Has AVX512VBITALG %x\n", has_avx512vbitalg);
-    printf("Has AVX512VPOPCNTDQ %x\n", has_avx512vpopcntdq);
-  }
-  return 0;
-}
-
diff --git a/thirdparty/libyuv/util/i444tonv12_eg.cc b/thirdparty/libyuv/util/i444tonv12_eg.cc
deleted file mode 100644
index 0fcb409..0000000
--- a/thirdparty/libyuv/util/i444tonv12_eg.cc
+++ /dev/null
@@ -1,28 +0,0 @@
-
-#include "libyuv/convert.h"
-
-#include <stdio.h>   // for printf
-#include <string.h>  // for memset
-
-int main(int, char**) {
-  unsigned char src_i444[640 * 400 * 3];
-  unsigned char dst_nv12[640 * 400 * 3 / 2];
-
-  for (size_t i = 0; i < sizeof(src_i444); ++i) {
-    src_i444[i] = i & 255;
-  }
-  memset(dst_nv12, 0, sizeof(dst_nv12));
-  libyuv::I444ToNV12(&src_i444[0], 640,             // source Y
-                     &src_i444[640 * 400], 640,      // source U
-                     &src_i444[640 * 400 * 2], 640,  // source V
-                     &dst_nv12[0], 640,              // dest Y
-                     &dst_nv12[640 * 400], 640,      // dest UV
-                     640, 400);                      // width and height
-
-  int checksum = 0;
-  for (size_t i = 0; i < sizeof(dst_nv12); ++i) {
-    checksum += dst_nv12[i];
-  }
-  printf("checksum %x %s\n", checksum, checksum == 0x2ec0c00 ? "PASS" : "FAIL");
-  return 0;
-}
\ No newline at end of file
diff --git a/thirdparty/libyuv/util/psnr.cc b/thirdparty/libyuv/util/psnr.cc
deleted file mode 100644
index c7bee7f..0000000
--- a/thirdparty/libyuv/util/psnr.cc
+++ /dev/null
@@ -1,291 +0,0 @@
-/*
- *  Copyright 2013 The LibYuv Project Authors. All rights reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS. All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "./psnr.h"  // NOLINT
-
-#ifdef _OPENMP
-#include <omp.h>
-#endif
-#ifdef _MSC_VER
-#include <intrin.h>  // For __cpuid()
-#endif
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-typedef unsigned int uint32_t;  // NOLINT
-#ifdef _MSC_VER
-typedef unsigned __int64 uint64_t;
-#else  // COMPILER_MSVC
-#if defined(__LP64__) && !defined(__OpenBSD__) && !defined(__APPLE__)
-typedef unsigned long uint64_t;  // NOLINT
-#else  // defined(__LP64__) && !defined(__OpenBSD__) && !defined(__APPLE__)
-typedef unsigned long long uint64_t;  // NOLINT
-#endif  // __LP64__
-#endif  // _MSC_VER
-
-// libyuv provides this function when linking library for jpeg support.
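-// (SSE in this file is the sum of squared error metric, not the x86 SIMD
-// instruction set; SIMD implementations of it are selected below.)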
-#if !defined(HAVE_JPEG) - -#if !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__) && \ - !defined(__aarch64__) -#define HAS_SUMSQUAREERROR_NEON -static uint32_t SumSquareError_NEON(const uint8_t* src_a, - const uint8_t* src_b, - int count) { - volatile uint32_t sse; - asm volatile( - "vmov.u8 q7, #0 \n" - "vmov.u8 q9, #0 \n" - "vmov.u8 q8, #0 \n" - "vmov.u8 q10, #0 \n" - - "1: \n" - "vld1.u8 {q0}, [%0]! \n" - "vld1.u8 {q1}, [%1]! \n" - "vsubl.u8 q2, d0, d2 \n" - "vsubl.u8 q3, d1, d3 \n" - "vmlal.s16 q7, d4, d4 \n" - "vmlal.s16 q8, d6, d6 \n" - "vmlal.s16 q8, d5, d5 \n" - "vmlal.s16 q10, d7, d7 \n" - "subs %2, %2, #16 \n" - "bhi 1b \n" - - "vadd.u32 q7, q7, q8 \n" - "vadd.u32 q9, q9, q10 \n" - "vadd.u32 q10, q7, q9 \n" - "vpaddl.u32 q1, q10 \n" - "vadd.u64 d0, d2, d3 \n" - "vmov.32 %3, d0[0] \n" - : "+r"(src_a), "+r"(src_b), "+r"(count), "=r"(sse) - : - : "memory", "cc", "q0", "q1", "q2", "q3", "q7", "q8", "q9", "q10"); - return sse; -} -#elif !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__) -#define HAS_SUMSQUAREERROR_NEON -static uint32_t SumSquareError_NEON(const uint8_t* src_a, - const uint8_t* src_b, - int count) { - volatile uint32_t sse; - asm volatile( - "eor v16.16b, v16.16b, v16.16b \n" - "eor v18.16b, v18.16b, v18.16b \n" - "eor v17.16b, v17.16b, v17.16b \n" - "eor v19.16b, v19.16b, v19.16b \n" - - "1: \n" - "ld1 {v0.16b}, [%0], #16 \n" - "ld1 {v1.16b}, [%1], #16 \n" - "subs %w2, %w2, #16 \n" - "usubl v2.8h, v0.8b, v1.8b \n" - "usubl2 v3.8h, v0.16b, v1.16b \n" - "smlal v16.4s, v2.4h, v2.4h \n" - "smlal v17.4s, v3.4h, v3.4h \n" - "smlal2 v18.4s, v2.8h, v2.8h \n" - "smlal2 v19.4s, v3.8h, v3.8h \n" - "b.gt 1b \n" - - "add v16.4s, v16.4s, v17.4s \n" - "add v18.4s, v18.4s, v19.4s \n" - "add v19.4s, v16.4s, v18.4s \n" - "addv s0, v19.4s \n" - "fmov %w3, s0 \n" - : "+r"(src_a), "+r"(src_b), "+r"(count), "=r"(sse) - : - : "cc", "v0", "v1", "v2", "v3", "v16", "v17", "v18", "v19"); - return sse; -} -#elif !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER) -#define HAS_SUMSQUAREERROR_SSE2 -__declspec(naked) static uint32_t SumSquareError_SSE2(const uint8_t* /*src_a*/, - const uint8_t* /*src_b*/, - int /*count*/) { - __asm { - mov eax, [esp + 4] // src_a - mov edx, [esp + 8] // src_b - mov ecx, [esp + 12] // count - pxor xmm0, xmm0 - pxor xmm5, xmm5 - sub edx, eax - - wloop: - movdqu xmm1, [eax] - movdqu xmm2, [eax + edx] - lea eax, [eax + 16] - movdqu xmm3, xmm1 - psubusb xmm1, xmm2 - psubusb xmm2, xmm3 - por xmm1, xmm2 - movdqu xmm2, xmm1 - punpcklbw xmm1, xmm5 - punpckhbw xmm2, xmm5 - pmaddwd xmm1, xmm1 - pmaddwd xmm2, xmm2 - paddd xmm0, xmm1 - paddd xmm0, xmm2 - sub ecx, 16 - ja wloop - - pshufd xmm1, xmm0, 0EEh - paddd xmm0, xmm1 - pshufd xmm1, xmm0, 01h - paddd xmm0, xmm1 - movd eax, xmm0 - ret - } -} -#elif !defined(LIBYUV_DISABLE_X86) && (defined(__x86_64__) || defined(__i386__)) -#define HAS_SUMSQUAREERROR_SSE2 -static uint32_t SumSquareError_SSE2(const uint8_t* src_a, - const uint8_t* src_b, - int count) { - uint32_t sse; - asm volatile( // NOLINT - "pxor %%xmm0,%%xmm0 \n" - "pxor %%xmm5,%%xmm5 \n" - "sub %0,%1 \n" - - "1: \n" - "movdqu (%0),%%xmm1 \n" - "movdqu (%0,%1,1),%%xmm2 \n" - "lea 0x10(%0),%0 \n" - "movdqu %%xmm1,%%xmm3 \n" - "psubusb %%xmm2,%%xmm1 \n" - "psubusb %%xmm3,%%xmm2 \n" - "por %%xmm2,%%xmm1 \n" - "movdqu %%xmm1,%%xmm2 \n" - "punpcklbw %%xmm5,%%xmm1 \n" - "punpckhbw %%xmm5,%%xmm2 \n" - "pmaddwd %%xmm1,%%xmm1 \n" - "pmaddwd %%xmm2,%%xmm2 \n" - "paddd %%xmm1,%%xmm0 \n" - "paddd %%xmm2,%%xmm0 \n" - "sub $0x10,%2 \n" - "ja 1b \n" - - 
"pshufd $0xee,%%xmm0,%%xmm1 \n" - "paddd %%xmm1,%%xmm0 \n" - "pshufd $0x1,%%xmm0,%%xmm1 \n" - "paddd %%xmm1,%%xmm0 \n" - "movd %%xmm0,%3 \n" - - : "+r"(src_a), // %0 - "+r"(src_b), // %1 - "+r"(count), // %2 - "=g"(sse) // %3 - : - : "memory", "cc" -#if defined(__SSE2__) - , - "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" -#endif - ); // NOLINT - return sse; -} -#endif // LIBYUV_DISABLE_X86 etc - -#if defined(HAS_SUMSQUAREERROR_SSE2) -#if (defined(__pic__) || defined(__APPLE__)) && defined(__i386__) -static __inline void __cpuid(int cpu_info[4], int info_type) { - asm volatile( // NOLINT - "mov %%ebx, %%edi \n" - "cpuid \n" - "xchg %%edi, %%ebx \n" - : "=a"(cpu_info[0]), "=D"(cpu_info[1]), "=c"(cpu_info[2]), - "=d"(cpu_info[3]) - : "a"(info_type)); -} -// For gcc/clang but not clangcl. -#elif !defined(_MSC_VER) && (defined(__i386__) || defined(__x86_64__)) -static __inline void __cpuid(int cpu_info[4], int info_type) { - asm volatile( // NOLINT - "cpuid \n" - : "=a"(cpu_info[0]), "=b"(cpu_info[1]), "=c"(cpu_info[2]), - "=d"(cpu_info[3]) - : "a"(info_type)); -} -#endif - -static int CpuHasSSE2() { -#if defined(__i386__) || defined(__x86_64__) || defined(_M_IX86) - int cpu_info[4]; - __cpuid(cpu_info, 1); - if (cpu_info[3] & 0x04000000) { - return 1; - } -#endif - return 0; -} -#endif // HAS_SUMSQUAREERROR_SSE2 - -static uint32_t SumSquareError_C(const uint8_t* src_a, - const uint8_t* src_b, - int count) { - uint32_t sse = 0u; - for (int x = 0; x < count; ++x) { - int diff = src_a[x] - src_b[x]; - sse += static_cast(diff * diff); - } - return sse; -} - -double ComputeSumSquareError(const uint8_t* src_a, - const uint8_t* src_b, - int count) { - uint32_t (*SumSquareError)(const uint8_t* src_a, const uint8_t* src_b, - int count) = SumSquareError_C; -#if defined(HAS_SUMSQUAREERROR_NEON) - SumSquareError = SumSquareError_NEON; -#endif -#if defined(HAS_SUMSQUAREERROR_SSE2) - if (CpuHasSSE2()) { - SumSquareError = SumSquareError_SSE2; - } -#endif - const int kBlockSize = 1 << 15; - uint64_t sse = 0; -#ifdef _OPENMP -#pragma omp parallel for reduction(+ : sse) -#endif - for (int i = 0; i < (count - (kBlockSize - 1)); i += kBlockSize) { - sse += SumSquareError(src_a + i, src_b + i, kBlockSize); - } - src_a += count & ~(kBlockSize - 1); - src_b += count & ~(kBlockSize - 1); - int remainder = count & (kBlockSize - 1) & ~15; - if (remainder) { - sse += SumSquareError(src_a, src_b, remainder); - src_a += remainder; - src_b += remainder; - } - remainder = count & 15; - if (remainder) { - sse += SumSquareError_C(src_a, src_b, remainder); - } - return static_cast(sse); -} -#endif - -// PSNR formula: psnr = 10 * log10 (Peak Signal^2 * size / sse) -// Returns 128.0 (kMaxPSNR) if sse is 0 (perfect match). -double ComputePSNR(double sse, double size) { - const double kMINSSE = 255.0 * 255.0 * size / pow(10.0, kMaxPSNR / 10.0); - if (sse <= kMINSSE) { - sse = kMINSSE; // Produces max PSNR of 128 - } - return 10.0 * log10(255.0 * 255.0 * size / sse); -} - -#ifdef __cplusplus -} // extern "C" -#endif diff --git a/thirdparty/libyuv/util/psnr.h b/thirdparty/libyuv/util/psnr.h deleted file mode 100644 index aac128c..0000000 --- a/thirdparty/libyuv/util/psnr.h +++ /dev/null @@ -1,47 +0,0 @@ -/* - * Copyright 2013 The LibYuv Project Authors. All rights reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. 
All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -// Get PSNR for video sequence. Assuming RAW 4:2:0 Y:Cb:Cr format - -#ifndef UTIL_PSNR_H_ // NOLINT -#define UTIL_PSNR_H_ - -#include // For log10() - -#ifdef __cplusplus -extern "C" { -#endif - -#if !defined(INT_TYPES_DEFINED) && !defined(UINT8_TYPE_DEFINED) -typedef unsigned char uint8_t; -#define UINT8_TYPE_DEFINED -#endif - -static const double kMaxPSNR = 128.0; - -// libyuv provides this function when linking library for jpeg support. -// TODO(fbarchard): make psnr lib compatible subset of libyuv. -#if !defined(HAVE_JPEG) -// Computer Sum of Squared Error (SSE). -// Pass this to ComputePSNR for final result. -double ComputeSumSquareError(const uint8_t* src_a, - const uint8_t* src_b, - int count); -#endif - -// PSNR formula: psnr = 10 * log10 (Peak Signal^2 * size / sse) -// Returns 128.0 (kMaxPSNR) if sse is 0 (perfect match). -double ComputePSNR(double sse, double size); - -#ifdef __cplusplus -} // extern "C" -#endif - -#endif // UTIL_PSNR_H_ // NOLINT diff --git a/thirdparty/libyuv/util/psnr_main.cc b/thirdparty/libyuv/util/psnr_main.cc deleted file mode 100644 index a930b20..0000000 --- a/thirdparty/libyuv/util/psnr_main.cc +++ /dev/null @@ -1,633 +0,0 @@ -/* - * Copyright 2013 The LibYuv Project Authors. All rights reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -// Get PSNR or SSIM for video sequence. Assuming RAW 4:2:0 Y:Cb:Cr format -// To build: g++ -O3 -o psnr psnr.cc ssim.cc psnr_main.cc -// or VisualC: cl /Ox psnr.cc ssim.cc psnr_main.cc -// -// To enable OpenMP and SSE2 -// gcc: g++ -msse2 -O3 -fopenmp -o psnr psnr.cc ssim.cc psnr_main.cc -// vc: cl /arch:SSE2 /Ox /openmp psnr.cc ssim.cc psnr_main.cc -// -// Usage: psnr org_seq rec_seq -s width height [-skip skip_org skip_rec] - -#ifndef _CRT_SECURE_NO_WARNINGS -#define _CRT_SECURE_NO_WARNINGS -#endif - -#include -#include -#include -#include -#ifdef _OPENMP -#include -#endif - -#include "./psnr.h" -#include "./ssim.h" -#ifdef HAVE_JPEG -#include "libyuv/compare.h" -#include "libyuv/convert.h" -#endif - -struct metric { - double y, u, v, all; - double min_y, min_u, min_v, min_all; - double global_y, global_u, global_v, global_all; - int min_frame; -}; - -// options -bool verbose = false; -bool quiet = false; -bool show_name = false; -bool do_swap_uv = false; -bool do_psnr = false; -bool do_ssim = false; -bool do_mse = false; -bool do_lssim = false; -int image_width = 0, image_height = 0; -int fileindex_org = 0; // argv argument contains the source file name. -int fileindex_rec = 0; // argv argument contains the destination file name. -int num_rec = 0; -int num_skip_org = 0; -int num_skip_rec = 0; -int num_frames = 0; -#ifdef _OPENMP -int num_threads = 0; -#endif - -// Parse PYUV format. ie name.1920x800_24Hz_P420.yuv -bool ExtractResolutionFromFilename(const char* name, - int* width_ptr, - int* height_ptr) { - // Isolate the .width_height. section of the filename by searching for a - // dot or underscore followed by a digit. - for (int i = 0; name[i]; ++i) { - if ((name[i] == '.' 
|| name[i] == '_') && name[i + 1] >= '0' && - name[i + 1] <= '9') { - int n = sscanf(name + i + 1, "%dx%d", width_ptr, height_ptr); // NOLINT - if (2 == n) { - return true; - } - } - } - -#ifdef HAVE_JPEG - // Try parsing file as a jpeg. - FILE* const file_org = fopen(name, "rb"); - if (file_org == NULL) { - fprintf(stderr, "Cannot open %s\n", name); - return false; - } - fseek(file_org, 0, SEEK_END); - size_t total_size = ftell(file_org); - fseek(file_org, 0, SEEK_SET); - uint8_t* const ch_org = new uint8_t[total_size]; - memset(ch_org, 0, total_size); - size_t bytes_org = fread(ch_org, sizeof(uint8_t), total_size, file_org); - fclose(file_org); - if (bytes_org == total_size) { - if (0 == libyuv::MJPGSize(ch_org, total_size, width_ptr, height_ptr)) { - delete[] ch_org; - return true; - } - } - delete[] ch_org; -#endif // HAVE_JPEG - return false; -} - -// Scale Y channel from 16..240 to 0..255. -// This can be useful when comparing codecs that are inconsistant about Y -uint8_t ScaleY(uint8_t y) { - int ny = (y - 16) * 256 / 224; - if (ny < 0) { - ny = 0; - } - if (ny > 255) { - ny = 255; - } - return static_cast(ny); -} - -// MSE = Mean Square Error -double GetMSE(double sse, double size) { - return sse / size; -} - -void PrintHelp(const char* program) { - printf("%s [-options] org_seq rec_seq [rec_seq2.. etc]\n", program); -#ifdef HAVE_JPEG - printf("jpeg or raw YUV 420 supported.\n"); -#endif - printf("options:\n"); - printf( - " -s .... specify YUV size, mandatory if none of the " - "sequences have the\n"); - printf( - " resolution embedded in their filename (ie. " - "name.1920x800_24Hz_P420.yuv)\n"); - printf(" -psnr .................. compute PSNR (default)\n"); - printf(" -ssim .................. compute SSIM\n"); - printf(" -mse ................... compute MSE\n"); - printf(" -swap .................. Swap U and V plane\n"); - printf(" -skip ...... Number of frame to skip of org and rec\n"); - printf(" -frames .......... Number of frames to compare\n"); -#ifdef _OPENMP - printf(" -t ............... Number of threads\n"); -#endif - printf(" -n ..................... Show file name\n"); - printf(" -v ..................... verbose++\n"); - printf(" -q ..................... quiet\n"); - printf(" -h ..................... 
this help\n"); - exit(0); -} - -void ParseOptions(int argc, const char* argv[]) { - if (argc <= 1) { - PrintHelp(argv[0]); - } - for (int c = 1; c < argc; ++c) { - if (!strcmp(argv[c], "-v")) { - verbose = true; - } else if (!strcmp(argv[c], "-q")) { - quiet = true; - } else if (!strcmp(argv[c], "-n")) { - show_name = true; - } else if (!strcmp(argv[c], "-psnr")) { - do_psnr = true; - } else if (!strcmp(argv[c], "-mse")) { - do_mse = true; - } else if (!strcmp(argv[c], "-ssim")) { - do_ssim = true; - } else if (!strcmp(argv[c], "-lssim")) { - do_ssim = true; - do_lssim = true; - } else if (!strcmp(argv[c], "-swap")) { - do_swap_uv = true; - } else if (!strcmp(argv[c], "-h") || !strcmp(argv[c], "-help")) { - PrintHelp(argv[0]); - } else if (!strcmp(argv[c], "-s") && c + 2 < argc) { - image_width = atoi(argv[++c]); // NOLINT - image_height = atoi(argv[++c]); // NOLINT - } else if (!strcmp(argv[c], "-skip") && c + 2 < argc) { - num_skip_org = atoi(argv[++c]); // NOLINT - num_skip_rec = atoi(argv[++c]); // NOLINT - } else if (!strcmp(argv[c], "-frames") && c + 1 < argc) { - num_frames = atoi(argv[++c]); // NOLINT -#ifdef _OPENMP - } else if (!strcmp(argv[c], "-t") && c + 1 < argc) { - num_threads = atoi(argv[++c]); // NOLINT -#endif - } else if (argv[c][0] == '-') { - fprintf(stderr, "Unknown option. %s\n", argv[c]); - } else if (fileindex_org == 0) { - fileindex_org = c; - } else if (fileindex_rec == 0) { - fileindex_rec = c; - num_rec = 1; - } else { - ++num_rec; - } - } - if (fileindex_org == 0 || fileindex_rec == 0) { - fprintf(stderr, "Missing filenames\n"); - PrintHelp(argv[0]); - } - if (num_skip_org < 0 || num_skip_rec < 0) { - fprintf(stderr, "Skipped frames incorrect\n"); - PrintHelp(argv[0]); - } - if (num_frames < 0) { - fprintf(stderr, "Number of frames incorrect\n"); - PrintHelp(argv[0]); - } - if (image_width == 0 || image_height == 0) { - int org_width, org_height; - int rec_width, rec_height; - bool org_res_avail = ExtractResolutionFromFilename(argv[fileindex_org], - &org_width, &org_height); - bool rec_res_avail = ExtractResolutionFromFilename(argv[fileindex_rec], - &rec_width, &rec_height); - if (org_res_avail) { - if (rec_res_avail) { - if ((org_width == rec_width) && (org_height == rec_height)) { - image_width = org_width; - image_height = org_height; - } else { - fprintf(stderr, "Sequences have different resolutions.\n"); - PrintHelp(argv[0]); - } - } else { - image_width = org_width; - image_height = org_height; - } - } else if (rec_res_avail) { - image_width = rec_width; - image_height = rec_height; - } else { - fprintf(stderr, "Missing dimensions.\n"); - PrintHelp(argv[0]); - } - } -} - -bool UpdateMetrics(uint8_t* ch_org, - uint8_t* ch_rec, - const int y_size, - const int uv_size, - const size_t total_size, - int number_of_frames, - metric* cur_distortion_psnr, - metric* distorted_frame, - bool do_psnr) { - const int uv_offset = (do_swap_uv ? 
uv_size : 0); - const uint8_t* const u_org = ch_org + y_size + uv_offset; - const uint8_t* const u_rec = ch_rec + y_size; - const uint8_t* const v_org = ch_org + y_size + (uv_size - uv_offset); - const uint8_t* const v_rec = ch_rec + y_size + uv_size; - if (do_psnr) { -#ifdef HAVE_JPEG - double y_err = static_cast( - libyuv::ComputeSumSquareError(ch_org, ch_rec, y_size)); - double u_err = static_cast( - libyuv::ComputeSumSquareError(u_org, u_rec, uv_size)); - double v_err = static_cast( - libyuv::ComputeSumSquareError(v_org, v_rec, uv_size)); -#else - double y_err = ComputeSumSquareError(ch_org, ch_rec, y_size); - double u_err = ComputeSumSquareError(u_org, u_rec, uv_size); - double v_err = ComputeSumSquareError(v_org, v_rec, uv_size); -#endif - const double total_err = y_err + u_err + v_err; - cur_distortion_psnr->global_y += y_err; - cur_distortion_psnr->global_u += u_err; - cur_distortion_psnr->global_v += v_err; - cur_distortion_psnr->global_all += total_err; - distorted_frame->y = ComputePSNR(y_err, static_cast(y_size)); - distorted_frame->u = ComputePSNR(u_err, static_cast(uv_size)); - distorted_frame->v = ComputePSNR(v_err, static_cast(uv_size)); - distorted_frame->all = - ComputePSNR(total_err, static_cast(total_size)); - } else { - distorted_frame->y = CalcSSIM(ch_org, ch_rec, image_width, image_height); - distorted_frame->u = - CalcSSIM(u_org, u_rec, (image_width + 1) / 2, (image_height + 1) / 2); - distorted_frame->v = - CalcSSIM(v_org, v_rec, (image_width + 1) / 2, (image_height + 1) / 2); - distorted_frame->all = - (distorted_frame->y + distorted_frame->u + distorted_frame->v) / - total_size; - distorted_frame->y /= y_size; - distorted_frame->u /= uv_size; - distorted_frame->v /= uv_size; - - if (do_lssim) { - distorted_frame->all = CalcLSSIM(distorted_frame->all); - distorted_frame->y = CalcLSSIM(distorted_frame->y); - distorted_frame->u = CalcLSSIM(distorted_frame->u); - distorted_frame->v = CalcLSSIM(distorted_frame->v); - } - } - - cur_distortion_psnr->y += distorted_frame->y; - cur_distortion_psnr->u += distorted_frame->u; - cur_distortion_psnr->v += distorted_frame->v; - cur_distortion_psnr->all += distorted_frame->all; - - bool ismin = false; - if (distorted_frame->y < cur_distortion_psnr->min_y) { - cur_distortion_psnr->min_y = distorted_frame->y; - } - if (distorted_frame->u < cur_distortion_psnr->min_u) { - cur_distortion_psnr->min_u = distorted_frame->u; - } - if (distorted_frame->v < cur_distortion_psnr->min_v) { - cur_distortion_psnr->min_v = distorted_frame->v; - } - if (distorted_frame->all < cur_distortion_psnr->min_all) { - cur_distortion_psnr->min_all = distorted_frame->all; - cur_distortion_psnr->min_frame = number_of_frames; - ismin = true; - } - return ismin; -} - -int main(int argc, const char* argv[]) { - ParseOptions(argc, argv); - if (!do_psnr && !do_ssim) { - do_psnr = true; - } - -#ifdef _OPENMP - if (num_threads) { - omp_set_num_threads(num_threads); - } - if (verbose) { - printf("OpenMP %d procs\n", omp_get_num_procs()); - } -#endif - // Open original file (first file argument) - FILE* const file_org = fopen(argv[fileindex_org], "rb"); - if (file_org == NULL) { - fprintf(stderr, "Cannot open %s\n", argv[fileindex_org]); - exit(1); - } - - // Open all files to compare to - FILE** file_rec = new FILE*[num_rec]; - memset(file_rec, 0, num_rec * sizeof(FILE*)); // NOLINT - for (int cur_rec = 0; cur_rec < num_rec; ++cur_rec) { - file_rec[cur_rec] = fopen(argv[fileindex_rec + cur_rec], "rb"); - if (file_rec[cur_rec] == NULL) { - fprintf(stderr, "Cannot 
open %s\n", argv[fileindex_rec + cur_rec]); - fclose(file_org); - for (int i = 0; i < cur_rec; ++i) { - fclose(file_rec[i]); - } - delete[] file_rec; - exit(1); - } - } - - const int y_size = image_width * image_height; - const int uv_size = ((image_width + 1) / 2) * ((image_height + 1) / 2); - const size_t total_size = y_size + 2 * uv_size; // NOLINT -#if defined(_MSC_VER) - _fseeki64( - file_org, - static_cast<__int64>(num_skip_org) * static_cast<__int64>(total_size), - SEEK_SET); -#else - fseek(file_org, num_skip_org * total_size, SEEK_SET); -#endif - for (int cur_rec = 0; cur_rec < num_rec; ++cur_rec) { -#if defined(_MSC_VER) - _fseeki64( - file_rec[cur_rec], - static_cast<__int64>(num_skip_rec) * static_cast<__int64>(total_size), - SEEK_SET); -#else - fseek(file_rec[cur_rec], num_skip_rec * total_size, SEEK_SET); -#endif - } - - uint8_t* const ch_org = new uint8_t[total_size]; - uint8_t* const ch_rec = new uint8_t[total_size]; - if (ch_org == NULL || ch_rec == NULL) { - fprintf(stderr, "No memory available\n"); - fclose(file_org); - for (int i = 0; i < num_rec; ++i) { - fclose(file_rec[i]); - } - delete[] ch_org; - delete[] ch_rec; - delete[] file_rec; - exit(1); - } - - metric* const distortion_psnr = new metric[num_rec]; - metric* const distortion_ssim = new metric[num_rec]; - for (int cur_rec = 0; cur_rec < num_rec; ++cur_rec) { - metric* cur_distortion_psnr = &distortion_psnr[cur_rec]; - cur_distortion_psnr->y = 0.0; - cur_distortion_psnr->u = 0.0; - cur_distortion_psnr->v = 0.0; - cur_distortion_psnr->all = 0.0; - cur_distortion_psnr->min_y = kMaxPSNR; - cur_distortion_psnr->min_u = kMaxPSNR; - cur_distortion_psnr->min_v = kMaxPSNR; - cur_distortion_psnr->min_all = kMaxPSNR; - cur_distortion_psnr->min_frame = 0; - cur_distortion_psnr->global_y = 0.0; - cur_distortion_psnr->global_u = 0.0; - cur_distortion_psnr->global_v = 0.0; - cur_distortion_psnr->global_all = 0.0; - distortion_ssim[cur_rec] = cur_distortion_psnr[cur_rec]; - } - - if (verbose) { - printf("Size: %dx%d\n", image_width, image_height); - } - - if (!quiet) { - printf("Frame"); - if (do_psnr) { - printf("\t PSNR-Y \t PSNR-U \t PSNR-V \t PSNR-All \t Frame"); - } - if (do_ssim) { - printf("\t SSIM-Y\t SSIM-U\t SSIM-V\t SSIM-All\t Frame"); - } - if (show_name) { - printf("\tName\n"); - } else { - printf("\n"); - } - } - - int number_of_frames; - for (number_of_frames = 0;; ++number_of_frames) { - if (num_frames && number_of_frames >= num_frames) { - break; - } - - size_t bytes_org = fread(ch_org, sizeof(uint8_t), total_size, file_org); - if (bytes_org < total_size) { -#ifdef HAVE_JPEG - // Try parsing file as a jpeg. - uint8_t* const ch_jpeg = new uint8_t[bytes_org]; - memcpy(ch_jpeg, ch_org, bytes_org); - memset(ch_org, 0, total_size); - - if (0 != libyuv::MJPGToI420(ch_jpeg, bytes_org, ch_org, image_width, - ch_org + y_size, (image_width + 1) / 2, - ch_org + y_size + uv_size, - (image_width + 1) / 2, image_width, - image_height, image_width, image_height)) { - delete[] ch_jpeg; - break; - } - delete[] ch_jpeg; -#else - break; -#endif // HAVE_JPEG - } - - for (int cur_rec = 0; cur_rec < num_rec; ++cur_rec) { - size_t bytes_rec = - fread(ch_rec, sizeof(uint8_t), total_size, file_rec[cur_rec]); - if (bytes_rec < total_size) { -#ifdef HAVE_JPEG - // Try parsing file as a jpeg. 
- uint8_t* const ch_jpeg = new uint8_t[bytes_rec]; - memcpy(ch_jpeg, ch_rec, bytes_rec); - memset(ch_rec, 0, total_size); - - if (0 != libyuv::MJPGToI420(ch_jpeg, bytes_rec, ch_rec, image_width, - ch_rec + y_size, (image_width + 1) / 2, - ch_rec + y_size + uv_size, - (image_width + 1) / 2, image_width, - image_height, image_width, image_height)) { - delete[] ch_jpeg; - break; - } - delete[] ch_jpeg; -#else - break; -#endif // HAVE_JPEG - } - - if (verbose) { - printf("%5d", number_of_frames); - } - if (do_psnr) { - metric distorted_frame = {}; - metric* cur_distortion_psnr = &distortion_psnr[cur_rec]; - bool ismin = UpdateMetrics(ch_org, ch_rec, y_size, uv_size, total_size, - number_of_frames, cur_distortion_psnr, - &distorted_frame, true); - if (verbose) { - printf("\t%10.6f", distorted_frame.y); - printf("\t%10.6f", distorted_frame.u); - printf("\t%10.6f", distorted_frame.v); - printf("\t%10.6f", distorted_frame.all); - printf("\t%5s", ismin ? "min" : ""); - } - } - if (do_ssim) { - metric distorted_frame = {}; - metric* cur_distortion_ssim = &distortion_ssim[cur_rec]; - bool ismin = UpdateMetrics(ch_org, ch_rec, y_size, uv_size, total_size, - number_of_frames, cur_distortion_ssim, - &distorted_frame, false); - if (verbose) { - printf("\t%10.6f", distorted_frame.y); - printf("\t%10.6f", distorted_frame.u); - printf("\t%10.6f", distorted_frame.v); - printf("\t%10.6f", distorted_frame.all); - printf("\t%5s", ismin ? "min" : ""); - } - } - if (verbose) { - if (show_name) { - printf("\t%s", argv[fileindex_rec + cur_rec]); - } - printf("\n"); - } - } - } - - // Final PSNR computation. - for (int cur_rec = 0; cur_rec < num_rec; ++cur_rec) { - metric* cur_distortion_psnr = &distortion_psnr[cur_rec]; - metric* cur_distortion_ssim = &distortion_ssim[cur_rec]; - if (number_of_frames > 0) { - const double norm = 1. 
/ static_cast(number_of_frames); - cur_distortion_psnr->y *= norm; - cur_distortion_psnr->u *= norm; - cur_distortion_psnr->v *= norm; - cur_distortion_psnr->all *= norm; - cur_distortion_ssim->y *= norm; - cur_distortion_ssim->u *= norm; - cur_distortion_ssim->v *= norm; - cur_distortion_ssim->all *= norm; - } - - if (do_psnr) { - const double global_psnr_y = - ComputePSNR(cur_distortion_psnr->global_y, - static_cast(y_size) * number_of_frames); - const double global_psnr_u = - ComputePSNR(cur_distortion_psnr->global_u, - static_cast(uv_size) * number_of_frames); - const double global_psnr_v = - ComputePSNR(cur_distortion_psnr->global_v, - static_cast(uv_size) * number_of_frames); - const double global_psnr_all = - ComputePSNR(cur_distortion_psnr->global_all, - static_cast(total_size) * number_of_frames); - printf("Global:\t%10.6f\t%10.6f\t%10.6f\t%10.6f\t%5d", global_psnr_y, - global_psnr_u, global_psnr_v, global_psnr_all, number_of_frames); - if (show_name) { - printf("\t%s", argv[fileindex_rec + cur_rec]); - } - printf("\n"); - } - - if (!quiet) { - printf("Avg:"); - if (do_psnr) { - printf("\t%10.6f\t%10.6f\t%10.6f\t%10.6f\t%5d", cur_distortion_psnr->y, - cur_distortion_psnr->u, cur_distortion_psnr->v, - cur_distortion_psnr->all, number_of_frames); - } - if (do_ssim) { - printf("\t%10.6f\t%10.6f\t%10.6f\t%10.6f\t%5d", cur_distortion_ssim->y, - cur_distortion_ssim->u, cur_distortion_ssim->v, - cur_distortion_ssim->all, number_of_frames); - } - if (show_name) { - printf("\t%s", argv[fileindex_rec + cur_rec]); - } - printf("\n"); - } - if (!quiet) { - printf("Min:"); - if (do_psnr) { - printf("\t%10.6f\t%10.6f\t%10.6f\t%10.6f\t%5d", - cur_distortion_psnr->min_y, cur_distortion_psnr->min_u, - cur_distortion_psnr->min_v, cur_distortion_psnr->min_all, - cur_distortion_psnr->min_frame); - } - if (do_ssim) { - printf("\t%10.6f\t%10.6f\t%10.6f\t%10.6f\t%5d", - cur_distortion_ssim->min_y, cur_distortion_ssim->min_u, - cur_distortion_ssim->min_v, cur_distortion_ssim->min_all, - cur_distortion_ssim->min_frame); - } - if (show_name) { - printf("\t%s", argv[fileindex_rec + cur_rec]); - } - printf("\n"); - } - - if (do_mse) { - double global_mse_y = - GetMSE(cur_distortion_psnr->global_y, - static_cast(y_size) * number_of_frames); - double global_mse_u = - GetMSE(cur_distortion_psnr->global_u, - static_cast(uv_size) * number_of_frames); - double global_mse_v = - GetMSE(cur_distortion_psnr->global_v, - static_cast(uv_size) * number_of_frames); - double global_mse_all = - GetMSE(cur_distortion_psnr->global_all, - static_cast(total_size) * number_of_frames); - printf("MSE:\t%10.6f\t%10.6f\t%10.6f\t%10.6f\t%5d", global_mse_y, - global_mse_u, global_mse_v, global_mse_all, number_of_frames); - if (show_name) { - printf("\t%s", argv[fileindex_rec + cur_rec]); - } - printf("\n"); - } - } - fclose(file_org); - for (int cur_rec = 0; cur_rec < num_rec; ++cur_rec) { - fclose(file_rec[cur_rec]); - } - delete[] distortion_psnr; - delete[] distortion_ssim; - delete[] ch_org; - delete[] ch_rec; - delete[] file_rec; - return 0; -} diff --git a/thirdparty/libyuv/util/ssim.cc b/thirdparty/libyuv/util/ssim.cc deleted file mode 100644 index 096fbcf..0000000 --- a/thirdparty/libyuv/util/ssim.cc +++ /dev/null @@ -1,364 +0,0 @@ -/* - * Copyright 2013 The LibYuv Project Authors. All rights reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. 
An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#include "../util/ssim.h" // NOLINT - -#include - -#ifdef __cplusplus -extern "C" { -#endif - -typedef unsigned int uint32_t; // NOLINT -typedef unsigned short uint16_t; // NOLINT - -#if !defined(LIBYUV_DISABLE_X86) && !defined(__SSE2__) && \ - (defined(_M_X64) || (defined(_M_IX86_FP) && (_M_IX86_FP >= 2))) -#define __SSE2__ -#endif -#if !defined(LIBYUV_DISABLE_X86) && defined(__SSE2__) -#include -#endif - -#ifdef _OPENMP -#include -#endif - -// SSIM -enum { KERNEL = 3, KERNEL_SIZE = 2 * KERNEL + 1 }; - -// Symmetric Gaussian kernel: K[i] = ~11 * exp(-0.3 * i * i) -// The maximum value (11 x 11) must be less than 128 to avoid sign -// problems during the calls to _mm_mullo_epi16(). -static const int K[KERNEL_SIZE] = { - 1, 3, 7, 11, 7, 3, 1 // ~11 * exp(-0.3 * i * i) -}; -static const double kiW[KERNEL + 1 + 1] = { - 1. / 1089., // 1 / sum(i:0..6, j..6) K[i]*K[j] - 1. / 1089., // 1 / sum(i:0..6, j..6) K[i]*K[j] - 1. / 1056., // 1 / sum(i:0..5, j..6) K[i]*K[j] - 1. / 957., // 1 / sum(i:0..4, j..6) K[i]*K[j] - 1. / 726., // 1 / sum(i:0..3, j..6) K[i]*K[j] -}; - -#if !defined(LIBYUV_DISABLE_X86) && defined(__SSE2__) - -#define PWEIGHT(A, B) static_cast(K[(A)] * K[(B)]) // weight product -#define MAKE_WEIGHT(L) \ - { \ - { \ - { \ - PWEIGHT(L, 0) \ - , PWEIGHT(L, 1), PWEIGHT(L, 2), PWEIGHT(L, 3), PWEIGHT(L, 4), \ - PWEIGHT(L, 5), PWEIGHT(L, 6), 0 \ - } \ - } \ - } - -// We need this union trick to be able to initialize constant static __m128i -// values. We can't call _mm_set_epi16() for static compile-time initialization. -static const struct { - union { - uint16_t i16_[8]; - __m128i m_; - } values_; -} W0 = MAKE_WEIGHT(0), W1 = MAKE_WEIGHT(1), W2 = MAKE_WEIGHT(2), - W3 = MAKE_WEIGHT(3); -// ... the rest is symmetric. -#undef MAKE_WEIGHT -#undef PWEIGHT -#endif - -// Common final expression for SSIM, once the weighted sums are known. -static double FinalizeSSIM(double iw, - double xm, - double ym, - double xxm, - double xym, - double yym) { - const double iwx = xm * iw; - const double iwy = ym * iw; - double sxx = xxm * iw - iwx * iwx; - double syy = yym * iw - iwy * iwy; - // small errors are possible, due to rounding. Clamp to zero. - if (sxx < 0.) { - sxx = 0.; - } - if (syy < 0.) { - syy = 0.; - } - const double sxsy = sqrt(sxx * syy); - const double sxy = xym * iw - iwx * iwy; - static const double C11 = (0.01 * 0.01) * (255 * 255); - static const double C22 = (0.03 * 0.03) * (255 * 255); - static const double C33 = (0.015 * 0.015) * (255 * 255); - const double l = (2. * iwx * iwy + C11) / (iwx * iwx + iwy * iwy + C11); - const double c = (2. * sxsy + C22) / (sxx + syy + C22); - const double s = (sxy + C33) / (sxsy + C33); - return l * c * s; -} - -// GetSSIM() does clipping. GetSSIMFullKernel() does not - -// TODO(skal): use summed tables? -// Note: worst case of accumulation is a weight of 33 = 11 + 2 * (7 + 3 + 1) -// with a diff of 255, squared. The maximum error is thus 0x4388241, -// which fits into 32 bits integers. 
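As a quick cross-check of the accumulator bound quoted in the comment above: the per-axis kernel weights sum to 33 = 11 + 2 * (7 + 3 + 1), so the worst-case weighted sum of squared 8-bit differences is 33^2 * 255^2 = 70812225 = 0x4388241, comfortably inside uint32_t range. A standalone sketch (not part of the deleted ssim.cc) that verifies this:

// Verifies the worst-case accumulation bound for the 7x7 SSIM kernel
// K = {1, 3, 7, 11, 7, 3, 1}: (sum K)^2 * 255^2 must fit in 32 bits.
#include <cstdint>
#include <cstdio>

int main() {
  const int K[7] = {1, 3, 7, 11, 7, 3, 1};
  uint64_t wsum = 0;
  for (int w : K) wsum += w;                       // 33 = 11 + 2 * (7 + 3 + 1)
  const uint64_t worst = wsum * wsum * 255 * 255;  // largest possible xxm/xym/yym
  printf("worst case 0x%llx, fits in uint32_t: %s\n",
         static_cast<unsigned long long>(worst),
         worst <= UINT32_MAX ? "yes" : "no");
  return 0;
}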
-double GetSSIM(const uint8_t* org, - const uint8_t* rec, - int xo, - int yo, - int W, - int H, - int stride) { - uint32_t ws = 0, xm = 0, ym = 0, xxm = 0, xym = 0, yym = 0; - org += (yo - KERNEL) * stride; - org += (xo - KERNEL); - rec += (yo - KERNEL) * stride; - rec += (xo - KERNEL); - for (int y_ = 0; y_ < KERNEL_SIZE; ++y_, org += stride, rec += stride) { - if (((yo - KERNEL + y_) < 0) || ((yo - KERNEL + y_) >= H)) { - continue; - } - const int Wy = K[y_]; - for (int x_ = 0; x_ < KERNEL_SIZE; ++x_) { - const int Wxy = Wy * K[x_]; - if (((xo - KERNEL + x_) >= 0) && ((xo - KERNEL + x_) < W)) { - const int org_x = org[x_]; - const int rec_x = rec[x_]; - ws += Wxy; - xm += Wxy * org_x; - ym += Wxy * rec_x; - xxm += Wxy * org_x * org_x; - xym += Wxy * org_x * rec_x; - yym += Wxy * rec_x * rec_x; - } - } - } - return FinalizeSSIM(1. / ws, xm, ym, xxm, xym, yym); -} - -double GetSSIMFullKernel(const uint8_t* org, - const uint8_t* rec, - int xo, - int yo, - int stride, - double area_weight) { - uint32_t xm = 0, ym = 0, xxm = 0, xym = 0, yym = 0; - -#if defined(LIBYUV_DISABLE_X86) || !defined(__SSE2__) - - org += yo * stride + xo; - rec += yo * stride + xo; - for (int y = 1; y <= KERNEL; y++) { - const int dy1 = y * stride; - const int dy2 = y * stride; - const int Wy = K[KERNEL + y]; - - for (int x = 1; x <= KERNEL; x++) { - // Compute the contributions of upper-left (ul), upper-right (ur) - // lower-left (ll) and lower-right (lr) points (see the diagram below). - // Symmetric Kernel will have same weight on those points. - // - - - - - - - - // - ul - - - ur - - // - - - - - - - - // - - - 0 - - - - // - - - - - - - - // - ll - - - lr - - // - - - - - - - - const int Wxy = Wy * K[KERNEL + x]; - const int ul1 = org[-dy1 - x]; - const int ur1 = org[-dy1 + x]; - const int ll1 = org[dy1 - x]; - const int lr1 = org[dy1 + x]; - - const int ul2 = rec[-dy2 - x]; - const int ur2 = rec[-dy2 + x]; - const int ll2 = rec[dy2 - x]; - const int lr2 = rec[dy2 + x]; - - xm += Wxy * (ul1 + ur1 + ll1 + lr1); - ym += Wxy * (ul2 + ur2 + ll2 + lr2); - xxm += Wxy * (ul1 * ul1 + ur1 * ur1 + ll1 * ll1 + lr1 * lr1); - xym += Wxy * (ul1 * ul2 + ur1 * ur2 + ll1 * ll2 + lr1 * lr2); - yym += Wxy * (ul2 * ul2 + ur2 * ur2 + ll2 * ll2 + lr2 * lr2); - } - - // Compute the contributions of up (u), down (d), left (l) and right (r) - // points across the main axes (see the diagram below). - // Symmetric Kernel will have same weight on those points. - // - - - - - - - - // - - - u - - - - // - - - - - - - - // - l - 0 - r - - // - - - - - - - - // - - - d - - - - // - - - - - - - - const int Wxy = Wy * K[KERNEL]; - const int u1 = org[-dy1]; - const int d1 = org[dy1]; - const int l1 = org[-y]; - const int r1 = org[y]; - - const int u2 = rec[-dy2]; - const int d2 = rec[dy2]; - const int l2 = rec[-y]; - const int r2 = rec[y]; - - xm += Wxy * (u1 + d1 + l1 + r1); - ym += Wxy * (u2 + d2 + l2 + r2); - xxm += Wxy * (u1 * u1 + d1 * d1 + l1 * l1 + r1 * r1); - xym += Wxy * (u1 * u2 + d1 * d2 + l1 * l2 + r1 * r2); - yym += Wxy * (u2 * u2 + d2 * d2 + l2 * l2 + r2 * r2); - } - - // Lastly the contribution of (x0, y0) point. 
- const int Wxy = K[KERNEL] * K[KERNEL]; - const int s1 = org[0]; - const int s2 = rec[0]; - - xm += Wxy * s1; - ym += Wxy * s2; - xxm += Wxy * s1 * s1; - xym += Wxy * s1 * s2; - yym += Wxy * s2 * s2; - -#else // __SSE2__ - - org += (yo - KERNEL) * stride + (xo - KERNEL); - rec += (yo - KERNEL) * stride + (xo - KERNEL); - - const __m128i zero = _mm_setzero_si128(); - __m128i x = zero; - __m128i y = zero; - __m128i xx = zero; - __m128i xy = zero; - __m128i yy = zero; - -// Read 8 pixels at line #L, and convert to 16bit, perform weighting -// and acccumulate. -#define LOAD_LINE_PAIR(L, WEIGHT) \ - do { \ - const __m128i v0 = \ - _mm_loadl_epi64(reinterpret_cast(org + (L)*stride)); \ - const __m128i v1 = \ - _mm_loadl_epi64(reinterpret_cast(rec + (L)*stride)); \ - const __m128i w0 = _mm_unpacklo_epi8(v0, zero); \ - const __m128i w1 = _mm_unpacklo_epi8(v1, zero); \ - const __m128i ww0 = _mm_mullo_epi16(w0, (WEIGHT).values_.m_); \ - const __m128i ww1 = _mm_mullo_epi16(w1, (WEIGHT).values_.m_); \ - x = _mm_add_epi32(x, _mm_unpacklo_epi16(ww0, zero)); \ - y = _mm_add_epi32(y, _mm_unpacklo_epi16(ww1, zero)); \ - x = _mm_add_epi32(x, _mm_unpackhi_epi16(ww0, zero)); \ - y = _mm_add_epi32(y, _mm_unpackhi_epi16(ww1, zero)); \ - xx = _mm_add_epi32(xx, _mm_madd_epi16(ww0, w0)); \ - xy = _mm_add_epi32(xy, _mm_madd_epi16(ww0, w1)); \ - yy = _mm_add_epi32(yy, _mm_madd_epi16(ww1, w1)); \ - } while (0) - -#define ADD_AND_STORE_FOUR_EPI32(M, OUT) \ - do { \ - uint32_t tmp[4]; \ - _mm_storeu_si128(reinterpret_cast<__m128i*>(tmp), (M)); \ - (OUT) = tmp[3] + tmp[2] + tmp[1] + tmp[0]; \ - } while (0) - - LOAD_LINE_PAIR(0, W0); - LOAD_LINE_PAIR(1, W1); - LOAD_LINE_PAIR(2, W2); - LOAD_LINE_PAIR(3, W3); - LOAD_LINE_PAIR(4, W2); - LOAD_LINE_PAIR(5, W1); - LOAD_LINE_PAIR(6, W0); - - ADD_AND_STORE_FOUR_EPI32(x, xm); - ADD_AND_STORE_FOUR_EPI32(y, ym); - ADD_AND_STORE_FOUR_EPI32(xx, xxm); - ADD_AND_STORE_FOUR_EPI32(xy, xym); - ADD_AND_STORE_FOUR_EPI32(yy, yym); - -#undef LOAD_LINE_PAIR -#undef ADD_AND_STORE_FOUR_EPI32 -#endif - - return FinalizeSSIM(area_weight, xm, ym, xxm, xym, yym); -} - -static int start_max(int x, int y) { - return (x > y) ? x : y; -} - -double CalcSSIM(const uint8_t* org, - const uint8_t* rec, - const int image_width, - const int image_height) { - double SSIM = 0.; - const int KERNEL_Y = (image_height < KERNEL) ? image_height : KERNEL; - const int KERNEL_X = (image_width < KERNEL) ? image_width : KERNEL; - const int start_x = start_max(image_width - 8 + KERNEL_X, KERNEL_X); - const int start_y = start_max(image_height - KERNEL_Y, KERNEL_Y); - const int stride = image_width; - - for (int j = 0; j < KERNEL_Y; ++j) { - for (int i = 0; i < image_width; ++i) { - SSIM += GetSSIM(org, rec, i, j, image_width, image_height, stride); - } - } - -#ifdef _OPENMP -#pragma omp parallel for reduction(+ : SSIM) -#endif - for (int j = KERNEL_Y; j < image_height - KERNEL_Y; ++j) { - for (int i = 0; i < KERNEL_X; ++i) { - SSIM += GetSSIM(org, rec, i, j, image_width, image_height, stride); - } - for (int i = KERNEL_X; i < start_x; ++i) { - SSIM += GetSSIMFullKernel(org, rec, i, j, stride, kiW[0]); - } - if (start_x < image_width) { - // GetSSIMFullKernel() needs to be able to read 8 pixels (in SSE2). So we - // copy the 8 rightmost pixels on a cache area, and pad this area with - // zeros which won't contribute to the overall SSIM value (but we need - // to pass the correct normalizing constant!). By using this cache, we can - // still call GetSSIMFullKernel() instead of the slower GetSSIM(). 
- // NOTE: we could use similar method for the left-most pixels too. - const int kScratchWidth = 8; - const int kScratchStride = kScratchWidth + KERNEL + 1; - uint8_t scratch_org[KERNEL_SIZE * kScratchStride] = {0}; - uint8_t scratch_rec[KERNEL_SIZE * kScratchStride] = {0}; - - for (int k = 0; k < KERNEL_SIZE; ++k) { - const int offset = - (j - KERNEL + k) * stride + image_width - kScratchWidth; - memcpy(scratch_org + k * kScratchStride, org + offset, kScratchWidth); - memcpy(scratch_rec + k * kScratchStride, rec + offset, kScratchWidth); - } - for (int k = 0; k <= KERNEL_X + 1; ++k) { - SSIM += GetSSIMFullKernel(scratch_org, scratch_rec, KERNEL + k, KERNEL, - kScratchStride, kiW[k]); - } - } - } - - for (int j = start_y; j < image_height; ++j) { - for (int i = 0; i < image_width; ++i) { - SSIM += GetSSIM(org, rec, i, j, image_width, image_height, stride); - } - } - return SSIM; -} - -double CalcLSSIM(double ssim) { - return -10.0 * log10(1.0 - ssim); -} - -#ifdef __cplusplus -} // extern "C" -#endif diff --git a/thirdparty/libyuv/util/ssim.h b/thirdparty/libyuv/util/ssim.h deleted file mode 100644 index a855f1d..0000000 --- a/thirdparty/libyuv/util/ssim.h +++ /dev/null @@ -1,38 +0,0 @@ -/* - * Copyright 2013 The LibYuv Project Authors. All rights reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -// Get SSIM for video sequence. Assuming RAW 4:2:0 Y:Cb:Cr format - -#ifndef UTIL_SSIM_H_ -#define UTIL_SSIM_H_ - -#include // For log10() - -#ifdef __cplusplus -extern "C" { -#endif - -#if !defined(INT_TYPES_DEFINED) && !defined(UINT8_TYPE_DEFINED) -typedef unsigned char uint8_t; -#define UINT8_TYPE_DEFINED -#endif - -double CalcSSIM(const uint8_t* org, - const uint8_t* rec, - const int image_width, - const int image_height); - -double CalcLSSIM(double ssim); - -#ifdef __cplusplus -} // extern "C" -#endif - -#endif // UTIL_SSIM_H_ diff --git a/thirdparty/libyuv/util/yuvconstants.c b/thirdparty/libyuv/util/yuvconstants.c deleted file mode 100644 index 037e082..0000000 --- a/thirdparty/libyuv/util/yuvconstants.c +++ /dev/null @@ -1,105 +0,0 @@ -/* - * Copyright 2021 The LibYuv Project Authors. All rights reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#include -#include -#include -#include - -// This utility computes values needed to generate yuvconstants based on -// white point values. -// The yuv formulas are tuned for 8 bit YUV channels. - -// See Also -// https://mymusing.co/bt601-yuv-to-rgb-conversion-color/ - -// BT.709 full range YUV to RGB reference -// R = Y + V * 1.5748 -// G = Y - U * 0.18732 - V * 0.46812 -// B = Y + U * 1.8556 -// KR = 0.2126 -// KB = 0.0722 - -// // Y contribution to R,G,B. Scale and bias. -// #define YG 16320 /* round(1.000 * 64 * 256 * 256 / 257) */ -// #define YB 32 /* 64 / 2 */ -// -// // U and V contributions to R,G,B. 
-// #define UB 113 /* round(1.77200 * 64) */ -// #define UG 22 /* round(0.34414 * 64) */ -// #define VG 46 /* round(0.71414 * 64) */ -// #define VR 90 /* round(1.40200 * 64) */ -// -// // Bias values to round, and subtract 128 from U and V. -// #define BB (-UB * 128 + YB) -// #define BG (UG * 128 + VG * 128 + YB) -// #define BR (-VR * 128 + YB) - -int main(int argc, const char* argv[]) { - if (argc < 2) { - printf("yuvconstants Kr Kb\n"); - printf(" MC BT KR = 0.2126; KB = 0.0722\n"); - printf(" 1 BT.709 KR = 0.2126; KB = 0.0722\n"); - printf(" 4 FCC KR = 0.30; KB = 0.11\n"); - printf(" 6 BT.601 KR = 0.299; KB = 0.114\n"); - printf(" 7 SMPTE 240M KR = 0.212; KB = 0.087\n"); - printf(" 9 BT.2020 KR = 0.2627; KB = 0.0593\n"); - return -1; - } - float kr = atof(argv[1]); - float kb = atof(argv[2]); - float kg = 1 - kr - kb; - - float vr = 2 * (1 - kr); - float ug = 2 * ((1 - kb) * kb / kg); - float vg = 2 * ((1 - kr) * kr / kg); - float ub = 2 * (1 - kb); - - printf("Full range\n"); - printf("R = Y + V * %5f\n", vr); - printf("G = Y - U * %6f - V * %6f\n", ug, vg); - printf("B = Y + U * %5f\n", ub); - - printf("KR = %4f; ", kr); - printf("KB = %4f\n", kb); - // printf("KG = %4f\n", kg); - // #define YG 16320 /* round(1.000 * 64 * 256 * 256 / 257) */ - // #define YB 32 /* 64 / 2 */ - // - // // U and V contributions to R,G,B. - - printf("UB %-3.0f /* round(%f * 64 = %8.4f) */\n", round(ub * 64), ub, ub * 64); - printf("UG %-3.0f /* round(%f * 64 = %8.4f) */\n", round(ug * 64), ug, ug * 64); - printf("VG %-3.0f /* round(%f * 64 = %8.4f) */\n", round(vg * 64), vg, vg * 64); - printf("VR %-3.0f /* round(%f * 64 = %8.4f) */\n", round(vr * 64), vr, vr * 64); - - vr = 255.f / 224.f * 2 * (1 - kr); - ug = 255.f / 224.f * 2 * ((1 - kb) * kb / kg); - vg = 255.f / 224.f * 2 * ((1 - kr) * kr / kg); - ub = 255.f / 224.f * 2 * (1 - kb); - - printf("\nLimited range\n"); - printf("R = (Y - 16) * 1.164 + V * %5f\n", vr); - printf("G = (Y - 16) * 1.164 - U * %6f - V * %6f\n", ug, vg); - printf("B = (Y - 16) * 1.164 + U * %5f\n", ub); - - // printf("KG = %4f\n", kg); - // #define YG 16320 /* round(1.000 * 64 * 256 * 256 / 257) */ - // #define YB 32 /* 64 / 2 */ - // - // // U and V contributions to R,G,B. - - printf("UB %-3.0f /* round(%f * 64 = %8.4f) */\n", round(ub * 64), ub, ub * 64); - printf("UG %-3.0f /* round(%f * 64 = %8.4f) */\n", round(ug * 64), ug, ug * 64); - printf("VG %-3.0f /* round(%f * 64 = %8.4f) */\n", round(vg * 64), vg, vg * 64); - printf("VR %-3.0f /* round(%f * 64 = %8.4f) */\n", round(vr * 64), vr, vr * 64); - - return 0; -} diff --git a/thirdparty/libyuv/util/yuvconvert.cc b/thirdparty/libyuv/util/yuvconvert.cc deleted file mode 100644 index 27cdfe9..0000000 --- a/thirdparty/libyuv/util/yuvconvert.cc +++ /dev/null @@ -1,367 +0,0 @@ -/* - * Copyright 2013 The LibYuv Project Authors. All rights reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -// Convert an ARGB image to YUV. 
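Circling back to the yuvconstants.c derivation above: plugging the BT.601 coefficients from its own help text (Kr = 0.299, Kb = 0.114) into the full-range formulas reproduces the fixed-point values quoted in the header comment (UB 113, UG 22, VG 46, VR 90). A standalone sketch of that arithmetic, not part of the deleted tool:

// Recomputes the BT.601 full-range U/V contributions in 6-bit fixed point.
#include <cmath>
#include <cstdio>

int main() {
  const double kr = 0.299, kb = 0.114;
  const double kg = 1.0 - kr - kb;                 // 0.587
  const double ub = 2.0 * (1.0 - kb);              // 1.772000
  const double ug = 2.0 * ((1.0 - kb) * kb / kg);  // 0.344136
  const double vg = 2.0 * ((1.0 - kr) * kr / kg);  // 0.714136
  const double vr = 2.0 * (1.0 - kr);              // 1.402000
  printf("UB %.0f UG %.0f VG %.0f VR %.0f\n",      // UB 113 UG 22 VG 46 VR 90
         std::round(ub * 64), std::round(ug * 64),
         std::round(vg * 64), std::round(vr * 64));
  return 0;
}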
-// Usage: yuvconvert src_argb.raw dst_yuv.raw - -#ifndef _CRT_SECURE_NO_WARNINGS -#define _CRT_SECURE_NO_WARNINGS -#endif - -#include -#include -#include -#include - -#include "libyuv/convert.h" -#include "libyuv/planar_functions.h" -#include "libyuv/scale_argb.h" - -// options -bool verbose = false; -bool attenuate = false; -bool unattenuate = false; -int image_width = 0, image_height = 0; // original width and height -int dst_width = 0, dst_height = 0; // new width and height -int fileindex_org = 0; // argv argument contains the original file name. -int fileindex_rec = 0; // argv argument contains the reconstructed file name. -int num_rec = 0; // Number of reconstructed images. -int num_skip_org = 0; // Number of frames to skip in original. -int num_frames = 0; // Number of frames to convert. -int filter = 1; // Bilinear filter for scaling. - -static __inline uint32_t Abs(int32_t v) { - return v >= 0 ? v : -v; -} - -// Parse PYUV format. ie name.1920x800_24Hz_P420.yuv -bool ExtractResolutionFromFilename(const char* name, - int* width_ptr, - int* height_ptr) { - // Isolate the .width_height. section of the filename by searching for a - // dot or underscore followed by a digit. - for (int i = 0; name[i]; ++i) { - if ((name[i] == '.' || name[i] == '_') && name[i + 1] >= '0' && - name[i + 1] <= '9') { - int n = sscanf(name + i + 1, "%dx%d", width_ptr, height_ptr); // NOLINT - if (2 == n) { - return true; - } - } - } - return false; -} - -void PrintHelp(const char* program) { - printf("%s [-options] src_argb.raw dst_yuv.raw\n", program); - printf( - " -s .... specify source resolution. " - "Optional if name contains\n" - " resolution (ie. " - "name.1920x800_24Hz_P420.yuv)\n" - " Negative value mirrors.\n"); - printf(" -d .... specify destination resolution.\n"); - printf(" -f ............ 0 = point, 1 = bilinear (default).\n"); - printf(" -skip ....... Number of frame to skip of src_argb\n"); - printf(" -frames .......... Number of frames to convert\n"); - printf(" -attenuate ............. Attenuate the ARGB image\n"); - printf(" -unattenuate ........... Unattenuate the ARGB image\n"); - printf(" -v ..................... verbose\n"); - printf(" -h ..................... this help\n"); - exit(0); -} - -void ParseOptions(int argc, const char* argv[]) { - if (argc <= 1) { - PrintHelp(argv[0]); - } - for (int c = 1; c < argc; ++c) { - if (!strcmp(argv[c], "-v")) { - verbose = true; - } else if (!strcmp(argv[c], "-attenuate")) { - attenuate = true; - } else if (!strcmp(argv[c], "-unattenuate")) { - unattenuate = true; - } else if (!strcmp(argv[c], "-h") || !strcmp(argv[c], "-help")) { - PrintHelp(argv[0]); - } else if (!strcmp(argv[c], "-s") && c + 2 < argc) { - image_width = atoi(argv[++c]); // NOLINT - image_height = atoi(argv[++c]); // NOLINT - } else if (!strcmp(argv[c], "-d") && c + 2 < argc) { - dst_width = atoi(argv[++c]); // NOLINT - dst_height = atoi(argv[++c]); // NOLINT - } else if (!strcmp(argv[c], "-skip") && c + 1 < argc) { - num_skip_org = atoi(argv[++c]); // NOLINT - } else if (!strcmp(argv[c], "-frames") && c + 1 < argc) { - num_frames = atoi(argv[++c]); // NOLINT - } else if (!strcmp(argv[c], "-f") && c + 1 < argc) { - filter = atoi(argv[++c]); // NOLINT - } else if (argv[c][0] == '-') { - fprintf(stderr, "Unknown option. 
%s\n", argv[c]); - } else if (fileindex_org == 0) { - fileindex_org = c; - } else if (fileindex_rec == 0) { - fileindex_rec = c; - num_rec = 1; - } else { - ++num_rec; - } - } - if (fileindex_org == 0 || fileindex_rec == 0) { - fprintf(stderr, "Missing filenames\n"); - PrintHelp(argv[0]); - } - if (num_skip_org < 0) { - fprintf(stderr, "Skipped frames incorrect\n"); - PrintHelp(argv[0]); - } - if (num_frames < 0) { - fprintf(stderr, "Number of frames incorrect\n"); - PrintHelp(argv[0]); - } - - int org_width, org_height; - int rec_width, rec_height; - bool org_res_avail = ExtractResolutionFromFilename(argv[fileindex_org], - &org_width, &org_height); - bool rec_res_avail = ExtractResolutionFromFilename(argv[fileindex_rec], - &rec_width, &rec_height); - if (image_width == 0 || image_height == 0) { - if (org_res_avail) { - image_width = org_width; - image_height = org_height; - } else if (rec_res_avail) { - image_width = rec_width; - image_height = rec_height; - } else { - fprintf(stderr, "Missing dimensions.\n"); - PrintHelp(argv[0]); - } - } - if (dst_width == 0 || dst_height == 0) { - if (rec_res_avail) { - dst_width = rec_width; - dst_height = rec_height; - } else { - dst_width = Abs(image_width); - dst_height = Abs(image_height); - } - } -} - -static const int kTileX = 32; -static const int kTileY = 32; - -static int TileARGBScale(const uint8_t* src_argb, - int src_stride_argb, - int src_width, - int src_height, - uint8_t* dst_argb, - int dst_stride_argb, - int dst_width, - int dst_height, - libyuv::FilterMode filtering) { - for (int y = 0; y < dst_height; y += kTileY) { - for (int x = 0; x < dst_width; x += kTileX) { - int clip_width = kTileX; - if (x + clip_width > dst_width) { - clip_width = dst_width - x; - } - int clip_height = kTileY; - if (y + clip_height > dst_height) { - clip_height = dst_height - y; - } - int r = libyuv::ARGBScaleClip(src_argb, src_stride_argb, src_width, - src_height, dst_argb, dst_stride_argb, - dst_width, dst_height, x, y, clip_width, - clip_height, filtering); - if (r) { - return r; - } - } - } - return 0; -} - -int main(int argc, const char* argv[]) { - ParseOptions(argc, argv); - - // Open original file (first file argument) - FILE* const file_org = fopen(argv[fileindex_org], "rb"); - if (file_org == NULL) { - fprintf(stderr, "Cannot open %s\n", argv[fileindex_org]); - exit(1); - } - - // Open all files to convert to - FILE** file_rec = new FILE*[num_rec]; - memset(file_rec, 0, num_rec * sizeof(FILE*)); // NOLINT - for (int cur_rec = 0; cur_rec < num_rec; ++cur_rec) { - file_rec[cur_rec] = fopen(argv[fileindex_rec + cur_rec], "wb"); - if (file_rec[cur_rec] == NULL) { - fprintf(stderr, "Cannot open %s\n", argv[fileindex_rec + cur_rec]); - fclose(file_org); - for (int i = 0; i < cur_rec; ++i) { - fclose(file_rec[i]); - } - delete[] file_rec; - exit(1); - } - } - - bool org_is_yuv = strstr(argv[fileindex_org], "_P420.") != NULL; - bool org_is_argb = strstr(argv[fileindex_org], "_ARGB.") != NULL; - if (!org_is_yuv && !org_is_argb) { - fprintf(stderr, "Original format unknown %s\n", argv[fileindex_org]); - exit(1); - } - int org_size = Abs(image_width) * Abs(image_height) * 4; // ARGB - // Input is YUV - if (org_is_yuv) { - const int y_size = Abs(image_width) * Abs(image_height); - const int uv_size = - ((Abs(image_width) + 1) / 2) * ((Abs(image_height) + 1) / 2); - org_size = y_size + 2 * uv_size; // YUV original. 
- } - - const int dst_size = dst_width * dst_height * 4; // ARGB scaled - const int y_size = dst_width * dst_height; - const int uv_size = ((dst_width + 1) / 2) * ((dst_height + 1) / 2); - const size_t total_size = y_size + 2 * uv_size; -#if defined(_MSC_VER) - _fseeki64(file_org, - static_cast<__int64>(num_skip_org) * static_cast<__int64>(org_size), - SEEK_SET); -#else - fseek(file_org, num_skip_org * total_size, SEEK_SET); -#endif - - uint8_t* const ch_org = new uint8_t[org_size]; - uint8_t* const ch_dst = new uint8_t[dst_size]; - uint8_t* const ch_rec = new uint8_t[total_size]; - if (ch_org == NULL || ch_rec == NULL) { - fprintf(stderr, "No memory available\n"); - fclose(file_org); - for (int i = 0; i < num_rec; ++i) { - fclose(file_rec[i]); - } - delete[] ch_org; - delete[] ch_dst; - delete[] ch_rec; - delete[] file_rec; - exit(1); - } - - if (verbose) { - printf("Size: %dx%d to %dx%d\n", image_width, image_height, dst_width, - dst_height); - } - - int number_of_frames; - for (number_of_frames = 0;; ++number_of_frames) { - if (num_frames && number_of_frames >= num_frames) { - break; - } - - // Load original YUV or ARGB frame. - size_t bytes_org = - fread(ch_org, sizeof(uint8_t), static_cast(org_size), file_org); - if (bytes_org < static_cast(org_size)) { - break; - } - - // TODO(fbarchard): Attenuate doesnt need to know dimensions. - // ARGB attenuate frame - if (org_is_argb && attenuate) { - libyuv::ARGBAttenuate(ch_org, 0, ch_org, 0, org_size / 4, 1); - } - // ARGB unattenuate frame - if (org_is_argb && unattenuate) { - libyuv::ARGBUnattenuate(ch_org, 0, ch_org, 0, org_size / 4, 1); - } - - for (int cur_rec = 0; cur_rec < num_rec; ++cur_rec) { - // Scale YUV or ARGB frame. - if (org_is_yuv) { - int src_width = Abs(image_width); - int src_height = Abs(image_height); - int half_src_width = (src_width + 1) / 2; - int half_src_height = (src_height + 1) / 2; - int half_dst_width = (dst_width + 1) / 2; - int half_dst_height = (dst_height + 1) / 2; - I420Scale( - ch_org, src_width, ch_org + src_width * src_height, half_src_width, - ch_org + src_width * src_height + half_src_width * half_src_height, - half_src_width, image_width, image_height, ch_rec, dst_width, - ch_rec + dst_width * dst_height, half_dst_width, - ch_rec + dst_width * dst_height + half_dst_width * half_dst_height, - half_dst_width, dst_width, dst_height, - static_cast(filter)); - } else { - TileARGBScale(ch_org, Abs(image_width) * 4, image_width, image_height, - ch_dst, dst_width * 4, dst_width, dst_height, - static_cast(filter)); - } - bool rec_is_yuv = strstr(argv[fileindex_rec + cur_rec], "_P420.") != NULL; - bool rec_is_argb = - strstr(argv[fileindex_rec + cur_rec], "_ARGB.") != NULL; - if (!rec_is_yuv && !rec_is_argb) { - fprintf(stderr, "Output format unknown %s\n", - argv[fileindex_rec + cur_rec]); - continue; // Advance to next file. - } - - // Convert ARGB to YUV. - if (!org_is_yuv && rec_is_yuv) { - int half_width = (dst_width + 1) / 2; - int half_height = (dst_height + 1) / 2; - libyuv::ARGBToI420( - ch_dst, dst_width * 4, ch_rec, dst_width, - ch_rec + dst_width * dst_height, half_width, - ch_rec + dst_width * dst_height + half_width * half_height, - half_width, dst_width, dst_height); - } - - // Output YUV or ARGB frame. 
- if (rec_is_yuv) { - size_t bytes_rec = - fwrite(ch_rec, sizeof(uint8_t), static_cast(total_size), - file_rec[cur_rec]); - if (bytes_rec < static_cast(total_size)) { - break; - } - } else { - size_t bytes_rec = - fwrite(ch_dst, sizeof(uint8_t), static_cast(dst_size), - file_rec[cur_rec]); - if (bytes_rec < static_cast(dst_size)) { - break; - } - } - if (verbose) { - printf("%5d", number_of_frames); - } - if (verbose) { - printf("\t%s", argv[fileindex_rec + cur_rec]); - printf("\n"); - } - } - } - - fclose(file_org); - for (int cur_rec = 0; cur_rec < num_rec; ++cur_rec) { - fclose(file_rec[cur_rec]); - } - delete[] ch_org; - delete[] ch_dst; - delete[] ch_rec; - delete[] file_rec; - return 0; -} diff --git a/thirdparty/libyuv/winarm.mk b/thirdparty/libyuv/winarm.mk deleted file mode 100644 index b0a344a..0000000 --- a/thirdparty/libyuv/winarm.mk +++ /dev/null @@ -1,47 +0,0 @@ -# This is a generic makefile for libyuv for Windows Arm. -# call "c:\Program Files (x86)\Microsoft Visual Studio 11.0\VC\bin\x86_arm\vcvarsx86_arm.bat" -# nmake /f winarm.mk -# make -f winarm.mk -# nmake /f winarm.mk clean -# consider /arch:ARMv7VE -CC=cl -CCFLAGS=/Ox /nologo /Iinclude /DWINAPI_FAMILY=WINAPI_FAMILY_PHONE_APP -AR=lib -ARFLAGS=/MACHINE:ARM /NOLOGO /SUBSYSTEM:NATIVE -RM=cmd /c del - -LOCAL_OBJ_FILES = \ - source/compare.o\ - source/compare_common.o\ - source/convert.o\ - source/convert_argb.o\ - source/convert_from.o\ - source/convert_from_argb.o\ - source/convert_to_argb.o\ - source/convert_to_i420.o\ - source/cpu_id.o\ - source/planar_functions.o\ - source/rotate.o\ - source/rotate_any.o\ - source/rotate_argb.o\ - source/rotate_common.o\ - source/row_any.o\ - source/row_common.o\ - source/scale.o\ - source/scale_any.o\ - source/scale_argb.o\ - source/scale_common.o\ - source/scale_uv.o\ - source/video_common.o - -.cc.o: - $(CC) /c $(CCFLAGS) $*.cc /Fo$@ - -all: libyuv_arm.lib winarm.mk - -libyuv_arm.lib: $(LOCAL_OBJ_FILES) winarm.mk - $(AR) $(ARFLAGS) /OUT:$@ $(LOCAL_OBJ_FILES) - -clean: - $(RM) "source\*.o" libyuv_arm.lib - diff --git a/thirdparty/libyuv/xmake.lua b/thirdparty/libyuv/xmake.lua index bb6e1a9..003a8f5 100644 --- a/thirdparty/libyuv/xmake.lua +++ b/thirdparty/libyuv/xmake.lua @@ -3,19 +3,16 @@ package("libyuv") set_homepage("https://chromium.googlesource.com/libyuv/libyuv/") set_description("libyuv is an open source project that includes YUV scaling and conversion functionality.") set_license("BSD-3-Clause") - -- add_versions("20210528", "eb6e7bb63738e29efd82ea3cf2a115238a89fa51") + set_urls("https://chromium.googlesource.com/libyuv/libyuv.git") + add_versions("2024.5.21", "8e18fc93c8c07d2ba6f9671281d6f35c8c47b2f4") - -- set_urls("https://chromium.googlesource.com/libyuv/libyuv.git") - -- add_versions("2023.10.27", "31e1d6f896615342d5d5b6bde8f7b50b3fd698dc") - - set_sourcedir(os.scriptdir()) add_deps("cmake") on_install("windows", "linux", "macosx", "android", "cross", "bsd", "mingw", function (package) local configs = {"-DTEST=OFF"} table.insert(configs, "-DCMAKE_BUILD_TYPE=" .. 
(package:debug() and "Debug" or "Release")) - io.replace("CMakeLists.txt", "INSTALL ( PROGRAMS ${CMAKE_BINARY_DIR}/yuvconvert DESTINATION bin )", "", {plain = true}) + io.replace("CMakeLists.txt", "INSTALL ( PROGRAMS ${CMAKE_BINARY_DIR}/yuvconvert DESTINATION bin )", "", {plain = true}) import("package.tools.cmake").install(package, configs) if package:is_plat("macosx", "linux", "android") then @@ -26,3 +23,7 @@ package("libyuv") end end end) + + on_test(function (package) + assert(package:has_cfuncs("I420Rotate", {includes = "libyuv/rotate.h"})) + end) \ No newline at end of file diff --git a/thirdparty/xmake.lua b/thirdparty/xmake.lua index 40e3c60..288faf9 100644 --- a/thirdparty/xmake.lua +++ b/thirdparty/xmake.lua @@ -1 +1 @@ -includes("openfec", "libyuv") \ No newline at end of file +includes("openfec", "libyuv", "aom") \ No newline at end of file diff --git a/xmake.lua b/xmake.lua index f8c932a..9a6ee77 100644 --- a/xmake.lua +++ b/xmake.lua @@ -11,31 +11,28 @@ add_defines("ASIO_STANDALONE", "ASIO_HAS_STD_TYPE_TRAITS", "ASIO_HAS_STD_SHARED_ "ASIO_HAS_STD_ADDRESSOF", "ASIO_HAS_STD_ATOMIC", "ASIO_HAS_STD_CHRONO", "ASIO_HAS_CSTDINT", "ASIO_HAS_STD_ARRAY", "ASIO_HAS_STD_SYSTEM_ERROR") -add_requires("asio 1.24.0", "nlohmann_json", "spdlog 1.11.0", "openfec", "libopus 1.4", "dav1d 1.1.0", "libyuv") -add_packages("asio", "nlohmann_json", "spdlog", "openfec", "libopus", "dav1d", "libyuv") +add_requires("asio 1.24.0", "nlohmann_json", "spdlog 1.11.0", "openfec", "libopus 1.4", "dav1d 1.1.0", "libyuv", "aom") +add_packages("asio", "nlohmann_json", "spdlog", "openfec", "libopus", "dav1d", "libyuv", "aom") includes("thirdparty") if is_os("windows") then add_requires("vcpkg::libnice", {configs = {shared = false}}) add_requires("openh264 2.1.1", {configs = {shared = false}}) - add_requires("vcpkg::aom 3.8.1") - add_packages("vcpkg::libnice", "openh264", "vcpkg::aom", "cuda") + add_packages("vcpkg::libnice", "openh264", "cuda") add_defines("_WEBSOCKETPP_CPP11_INTERNAL_") add_requires("cuda") elseif is_os("linux") then add_requires("glib", {system = true}) add_requires("vcpkg::libnice", {configs = {shared = false}}) add_requires("openh264 2.1.1", {configs = {shared = false}}) - add_requires("vcpkg::aom 3.8.1") add_packages("glib", "vcpkg::libnice", "openh264", "cuda") add_cxflags("-fPIC") add_syslinks("pthread") elseif is_os("macosx") then add_requires("vcpkg::libnice", {configs = {shared = false}}) add_requires("vcpkg::openh264", {configs = {shared = false}}) - add_requires("vcpkg::aom 3.8.1") - add_packages("vcpkg::libnice", "vcpkg::openh264", "vcpkg::aom") + add_packages("vcpkg::libnice", "vcpkg::openh264") add_ldflags("-Wl,-ld_classic") end @@ -197,7 +194,7 @@ target("projectx") add_links("nice", "glib-2.0", "gio-2.0", "gmodule-2.0", "gobject-2.0", "pcre2-8", "pcre2-16", "pcre2-32", "pcre2-posix", "zlib", "ffi", "libcrypto", "libssl", "intl", "iconv", - "Shell32", "Advapi32", "Dnsapi", "Shlwapi", + "Shell32", "Advapi32", "Dnsapi", "Shlwapi", "Crypt32", "cuda", "nvencodeapi", "nvcuvid", "ws2_32", "Bcrypt", "windowsapp", "User32", "Strmiids", "Mfuuid", "Secur32", "Bcrypt")
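Since the build now consumes the source-built aom package from thirdparty/ instead of vcpkg::aom 3.8.1, the quickest way to confirm the new dependency is wired through is the same probe the package's on_test performs: compile and link a call to aom_codec_version(). A minimal standalone sketch, assuming the include and link paths come from add_packages("aom"):

// Smoke test for the in-tree aom package: compiles, links and prints the
// libaom version, mirroring the on_test check in thirdparty/aom/xmake.lua.
#include <cstdio>
#include "aom/aom_codec.h"

int main() {
  printf("libaom %s (0x%x)\n", aom_codec_version_str(), aom_codec_version());
  return 0;
}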